diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,74043 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "avg_penalty/after_target": 2.6603511571884155, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.8065536916255951, + "avg_penalty/before_think": 0.8721397258341312, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 967.25, + "completions/max_terminated_length": 913.5, + "completions/mean_length": 695.625, + "completions/mean_terminated_length": 637.5263824462891, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.0005, + "grad_norm": 0.8059782981872559, + "kl": 0.0, + "learning_rate": 0.0, + "loss": 0.0133, + "num_tokens": 53864.0, + "reward": 0.2421875, + "reward_std": 0.45039617270231247, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.11180340498685837, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.1484375, + "rewards/tag_count_reward/std": 0.3200250416994095, + "step": 1, + "token_counts/after_target": 4956.0, + "token_counts/after_think": 190.5, + "token_counts/before_target": 5554.0, + "token_counts/before_think": 429.5 + }, + { + "avg_penalty/after_target": 2.495770275592804, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.6772881373763084, + "avg_penalty/before_think": 0.12845617532730103, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 858.25, + "completions/max_terminated_length": 852.25, + "completions/mean_length": 593.46875, + "completions/mean_terminated_length": 583.8660736083984, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.001, + "grad_norm": 0.7978835105895996, + "kl": 0.0, + "learning_rate": 1.0000000000000001e-07, + "loss": -0.0342, + "num_tokens": 100182.0, + "reward": 0.08984375, + "reward_std": 0.15512817353010178, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.05859375, + "rewards/tag_count_reward/std": 0.13377772271633148, + "step": 2, + "token_counts/after_target": 3809.0, + "token_counts/after_think": 0.0, + "token_counts/before_target": 5446.75, + "token_counts/before_think": 239.75 + }, + { + "avg_penalty/after_target": 2.334700107574463, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.6689730435609818, + "avg_penalty/before_think": 0.61893230676651, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 1006.5, + "completions/max_terminated_length": 887.5, + "completions/mean_length": 630.6875, + "completions/mean_terminated_length": 583.4043350219727, + "completions/min_length": 252.25, + "completions/min_terminated_length": 252.25, + "epoch": 0.0015, + "grad_norm": 0.6297087669372559, + "kl": 0.0004048347473144531, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.0315, + "num_tokens": 151394.0, + "reward": 0.0546875, + "reward_std": 0.14018996804952621, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + 
"rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0546875, + "rewards/tag_count_reward/std": 0.14018996804952621, + "step": 3, + "token_counts/after_target": 3937.5, + "token_counts/after_think": 133.5, + "token_counts/before_target": 5785.25, + "token_counts/before_think": 234.75 + }, + { + "avg_penalty/after_target": 2.6564077734947205, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.9010032489895821, + "avg_penalty/before_think": 0.5155139416456223, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 924.25, + "completions/max_terminated_length": 879.25, + "completions/mean_length": 723.0625, + "completions/mean_terminated_length": 625.9492645263672, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "epoch": 0.002, + "grad_norm": 0.5361136198043823, + "kl": 0.0002999305725097656, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.0491, + "num_tokens": 207606.0, + "reward": 0.10546875, + "reward_std": 0.23757922649383545, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.08984375, + "rewards/tag_count_reward/std": 0.19700118899345398, + "step": 4, + "token_counts/after_target": 5455.0, + "token_counts/after_think": 7.75, + "token_counts/before_target": 5625.0, + "token_counts/before_think": 481.25 + }, + { + "avg_penalty/after_target": 2.5434232354164124, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.7979010194540024, + "avg_penalty/before_think": 0.22744664922356606, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, 
+ "completions/max_length": 982.5, + "completions/max_terminated_length": 947.0, + "completions/mean_length": 636.390625, + "completions/mean_terminated_length": 607.4666748046875, + "completions/min_length": 291.75, + "completions/min_terminated_length": 291.75, + "epoch": 0.0025, + "grad_norm": 0.5623772740364075, + "kl": 0.00037479400634765625, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.001, + "num_tokens": 257631.0, + "reward": 0.265625, + "reward_std": 0.3265390247106552, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.11967839300632477, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.14789126068353653, + "rewards/tag_count_reward/mean": 0.140625, + "rewards/tag_count_reward/std": 0.1766088679432869, + "step": 5, + "token_counts/after_target": 4377.5, + "token_counts/after_think": 0.0, + "token_counts/before_target": 5439.5, + "token_counts/before_think": 365.25 + }, + { + "avg_penalty/after_target": 2.4222171902656555, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.8015927374362946, + "avg_penalty/before_think": 0.15816621482372284, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 875.75, + "completions/max_terminated_length": 866.25, + "completions/mean_length": 744.125, + "completions/mean_terminated_length": 712.7620239257812, + "completions/min_length": 480.75, + "completions/min_terminated_length": 480.75, + "epoch": 0.003, + "grad_norm": 0.6268067955970764, + "kl": 0.00033092498779296875, + "learning_rate": 5.000000000000001e-07, + "loss": -0.0281, + "num_tokens": 314631.0, + "reward": 0.125, + "reward_std": 0.26179538667201996, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 
0.0625, + "rewards/tag_count_reward/mean": 0.078125, + "rewards/tag_count_reward/std": 0.1372452899813652, + "step": 6, + "token_counts/after_target": 5313.0, + "token_counts/after_think": 0.0, + "token_counts/before_target": 6066.0, + "token_counts/before_think": 527.0 + }, + { + "avg_penalty/after_target": 2.526650846004486, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.6202300488948822, + "avg_penalty/before_think": 0.761177271604538, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 816.25, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 518.0, + "completions/mean_terminated_length": 512.4458465576172, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.0035, + "grad_norm": 1.027683138847351, + "kl": 0.0003800392150878906, + "learning_rate": 6.000000000000001e-07, + "loss": 0.0183, + "num_tokens": 356999.0, + "reward": 0.359375, + "reward_std": 0.5262051373720169, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.2257782220840454, + "rewards/tag_count_reward/mean": 0.15625, + "rewards/tag_count_reward/std": 0.3265710473060608, + "step": 7, + "token_counts/after_target": 2909.25, + "token_counts/after_think": 43.5, + "token_counts/before_target": 4894.25, + "token_counts/before_think": 441.0 + }, + { + "avg_penalty/after_target": 2.334340453147888, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.6723927184939384, + "avg_penalty/before_think": 0.5957894772291183, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 924.5, 
+ "completions/max_terminated_length": 921.25, + "completions/mean_length": 635.0625, + "completions/mean_terminated_length": 625.2589416503906, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.004, + "grad_norm": 0.6485996246337891, + "kl": 0.0003352165222167969, + "learning_rate": 7.000000000000001e-07, + "loss": -0.0388, + "num_tokens": 406539.0, + "reward": 0.12890625, + "reward_std": 0.38875430822372437, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.1875, + "rewards/tag_count_reward/mean": 0.08203125, + "rewards/tag_count_reward/std": 0.21860739588737488, + "step": 8, + "token_counts/after_target": 3730.5, + "token_counts/after_think": 0.0, + "token_counts/before_target": 5642.25, + "token_counts/before_think": 788.25 + }, + { + "avg_penalty/after_target": 2.3163277208805084, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.8764293640851974, + "avg_penalty/before_think": 1.2367396242916584, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 991.75, + "completions/max_terminated_length": 976.75, + "completions/mean_length": 717.25, + "completions/mean_terminated_length": 673.5586853027344, + "completions/min_length": 217.75, + "completions/min_terminated_length": 217.75, + "epoch": 0.0045, + "grad_norm": 0.6452906131744385, + "kl": 0.0003638267517089844, + "learning_rate": 8.000000000000001e-07, + "loss": 0.0473, + "num_tokens": 464267.0, + "reward": 0.13671875, + "reward_std": 0.35908643901348114, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.10546875, + 
"rewards/tag_count_reward/std": 0.23505647480487823, + "step": 9, + "token_counts/after_target": 5021.25, + "token_counts/after_think": 223.0, + "token_counts/before_target": 5882.25, + "token_counts/before_think": 349.5 + }, + { + "avg_penalty/after_target": 2.3527349829673767, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.8759422302246094, + "avg_penalty/before_think": 0.45838284492492676, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 913.0, + "completions/max_terminated_length": 889.5, + "completions/mean_length": 690.265625, + "completions/mean_terminated_length": 649.5310211181641, + "completions/min_length": 420.5, + "completions/min_terminated_length": 420.5, + "epoch": 0.005, + "grad_norm": 0.632891058921814, + "kl": 0.0003542900085449219, + "learning_rate": 9.000000000000001e-07, + "loss": 0.0613, + "num_tokens": 519836.0, + "reward": 0.48828125, + "reward_std": 0.3618091940879822, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.2675696536898613, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.14789126068353653, + "rewards/tag_count_reward/mean": 0.09765625, + "rewards/tag_count_reward/std": 0.23980027437210083, + "step": 10, + "token_counts/after_target": 4877.5, + "token_counts/after_think": 68.5, + "token_counts/before_target": 5293.75, + "token_counts/before_think": 804.5 + }, + { + "avg_penalty/after_target": 2.4828290343284607, + "avg_penalty/after_think": 1.7294525504112244, + "avg_penalty/before_target": 0.71388353779912, + "avg_penalty/before_think": 0.478736013174057, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 855.25, + 
"completions/max_terminated_length": 826.5, + "completions/mean_length": 633.0, + "completions/mean_terminated_length": 587.7031402587891, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.0055, + "grad_norm": 0.9686952829360962, + "kl": 0.0005650520324707031, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.0173, + "num_tokens": 569228.0, + "reward": 0.2734375, + "reward_std": 0.4508705697953701, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.2109375, + "rewards/tag_count_reward/std": 0.3581989072263241, + "step": 11, + "token_counts/after_target": 4050.5, + "token_counts/after_think": 154.25, + "token_counts/before_target": 4879.25, + "token_counts/before_think": 1044.0 + }, + { + "avg_penalty/after_target": 2.6559526324272156, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.8321611136198044, + "avg_penalty/before_think": 1.7482365146279335, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 999.5, + "completions/max_terminated_length": 983.25, + "completions/mean_length": 692.421875, + "completions/mean_terminated_length": 644.4693756103516, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.006, + "grad_norm": 0.8116323351860046, + "kl": 0.000614166259765625, + "learning_rate": 1.1e-06, + "loss": 0.0503, + "num_tokens": 621607.0, + "reward": 0.23046875, + "reward_std": 0.46706516668200493, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.14789126068353653, + "rewards/tag_count_reward/mean": 0.18359375, + 
"rewards/tag_count_reward/std": 0.3446059189736843, + "step": 12, + "token_counts/after_target": 5226.5, + "token_counts/after_think": 53.5, + "token_counts/before_target": 5409.75, + "token_counts/before_think": 389.0 + }, + { + "avg_penalty/after_target": 2.8002877235412598, + "avg_penalty/after_think": 2.238015726208687, + "avg_penalty/before_target": 0.878023311495781, + "avg_penalty/before_think": 0.599238894879818, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 991.5, + "completions/max_terminated_length": 853.25, + "completions/mean_length": 616.3125, + "completions/mean_terminated_length": 586.0524291992188, + "completions/min_length": 260.5, + "completions/min_terminated_length": 260.5, + "epoch": 0.0065, + "grad_norm": 0.914971113204956, + "kl": 0.0012950897216796875, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.1064, + "num_tokens": 670075.0, + "reward": 0.671875, + "reward_std": 0.7568278685212135, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.21347815543413162, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.33539126068353653, + "rewards/tag_count_reward/mean": 0.375, + "rewards/tag_count_reward/std": 0.4319520816206932, + "step": 13, + "token_counts/after_target": 4047.0, + "token_counts/after_think": 29.25, + "token_counts/before_target": 3941.75, + "token_counts/before_think": 1843.0 + }, + { + "avg_penalty/after_target": 2.3514966666698456, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.9773078858852386, + "avg_penalty/before_think": 0.7162721753120422, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 903.25, + 
"completions/max_terminated_length": 848.5, + "completions/mean_length": 718.5, + "completions/mean_terminated_length": 658.8113250732422, + "completions/min_length": 316.25, + "completions/min_terminated_length": 316.25, + "epoch": 0.007, + "grad_norm": 0.8085166215896606, + "kl": 0.0018749237060546875, + "learning_rate": 1.3e-06, + "loss": 0.1005, + "num_tokens": 723803.0, + "reward": 0.46484375, + "reward_std": 0.5654404014348984, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.20155644416809082, + "rewards/tag_count_reward/mean": 0.37109375, + "rewards/tag_count_reward/std": 0.4071407690644264, + "step": 14, + "token_counts/after_target": 5038.75, + "token_counts/after_think": 54.0, + "token_counts/before_target": 4766.5, + "token_counts/before_think": 1636.75 + }, + { + "avg_penalty/after_target": 2.3507620096206665, + "avg_penalty/after_think": 2.537249505519867, + "avg_penalty/before_target": 0.6179757416248322, + "avg_penalty/before_think": 0.6932187527418137, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 905.5, + "completions/max_terminated_length": 880.75, + "completions/mean_length": 523.5, + "completions/mean_terminated_length": 504.35462951660156, + "completions/min_length": 120.5, + "completions/min_terminated_length": 120.5, + "epoch": 0.0075, + "grad_norm": 0.961773693561554, + "kl": 0.0026569366455078125, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.0677, + "num_tokens": 773627.0, + "reward": 1.1875, + "reward_std": 0.7407689988613129, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.21347815543413162, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4185478091239929, + "rewards/tag_count_reward/mean": 0.796875, + 
"rewards/tag_count_reward/std": 0.29194265231490135, + "step": 15, + "token_counts/after_target": 1666.75, + "token_counts/after_think": 543.25, + "token_counts/before_target": 2424.5, + "token_counts/before_think": 3741.5 + }, + { + "avg_penalty/after_target": 2.5061271488666534, + "avg_penalty/after_think": 3.9800102710723877, + "avg_penalty/before_target": 0.8539560958743095, + "avg_penalty/before_think": 0.5834926068782806, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 935.75, + "completions/max_terminated_length": 912.25, + "completions/mean_length": 493.40625, + "completions/mean_terminated_length": 485.18125915527344, + "completions/min_length": 116.25, + "completions/min_terminated_length": 116.25, + "epoch": 0.008, + "grad_norm": 0.8652283549308777, + "kl": 0.01361083984375, + "learning_rate": 1.5e-06, + "loss": 0.0261, + "num_tokens": 813893.0, + "reward": 1.33984375, + "reward_std": 0.6350302994251251, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.11967839300632477, + "rewards/format_reward/mean": 0.4375, + "rewards/format_reward/std": 0.468590147793293, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.27169322222471237, + "step": 16, + "token_counts/after_target": 1417.25, + "token_counts/after_think": 755.0, + "token_counts/before_target": 1498.0, + "token_counts/before_think": 4224.25 + }, + { + "avg_penalty/after_target": 2.25075227022171, + "avg_penalty/after_think": 3.47831392288208, + "avg_penalty/before_target": 0.9385691583156586, + "avg_penalty/before_think": 0.7955731004476547, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 1024.0, + 
"completions/max_terminated_length": 868.25, + "completions/mean_length": 688.78125, + "completions/mean_terminated_length": 589.5389022827148, + "completions/min_length": 180.75, + "completions/min_terminated_length": 180.75, + "epoch": 0.0085, + "grad_norm": 0.758905827999115, + "kl": 0.00557708740234375, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.1294, + "num_tokens": 866871.0, + "reward": 1.09765625, + "reward_std": 0.6834207326173782, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.328125, + "rewards/format_reward/std": 0.4692344516515732, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.2634778842329979, + "step": 17, + "token_counts/after_target": 3471.5, + "token_counts/after_think": 559.25, + "token_counts/before_target": 2873.5, + "token_counts/before_think": 4116.25 + }, + { + "avg_penalty/after_target": 2.2419813573360443, + "avg_penalty/after_think": 3.979794681072235, + "avg_penalty/before_target": 0.6531802341341972, + "avg_penalty/before_think": 0.8598224520683289, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 858.75, + "completions/max_terminated_length": 840.75, + "completions/mean_length": 528.046875, + "completions/mean_terminated_length": 503.53558349609375, + "completions/min_length": 141.5, + "completions/min_terminated_length": 141.5, + "epoch": 0.009, + "grad_norm": 1.0489574670791626, + "kl": 0.006153106689453125, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.1064, + "num_tokens": 908426.0, + "reward": 1.4765625, + "reward_std": 0.6743074357509613, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.47865550220012665, + 
"rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.24073662236332893, + "step": 18, + "token_counts/after_target": 2230.25, + "token_counts/after_think": 281.5, + "token_counts/before_target": 2541.25, + "token_counts/before_think": 3395.75 + }, + { + "avg_penalty/after_target": 2.0656306743621826, + "avg_penalty/after_think": 3.9372466802597046, + "avg_penalty/before_target": 0.9064598381519318, + "avg_penalty/before_think": 1.074721872806549, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 995.25, + "completions/max_terminated_length": 969.75, + "completions/mean_length": 673.28125, + "completions/mean_terminated_length": 608.2027435302734, + "completions/min_length": 211.75, + "completions/min_terminated_length": 211.75, + "epoch": 0.0095, + "grad_norm": 0.779958963394165, + "kl": 0.00843048095703125, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0986, + "num_tokens": 960428.0, + "reward": 1.26953125, + "reward_std": 0.7238347679376602, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.497555673122406, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.3253634124994278, + "step": 19, + "token_counts/after_target": 3790.0, + "token_counts/after_think": 593.0, + "token_counts/before_target": 3952.75, + "token_counts/before_think": 2436.75 + }, + { + "avg_penalty/after_target": 2.6413190066814423, + "avg_penalty/after_think": 3.4781126379966736, + "avg_penalty/before_target": 0.59861109405756, + "avg_penalty/before_think": 0.6462263464927673, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.09375, + "completions/max_length": 850.75, + "completions/max_terminated_length": 845.25, + "completions/mean_length": 458.859375, + "completions/mean_terminated_length": 407.9812545776367, + "completions/min_length": 71.25, + "completions/min_terminated_length": 71.25, + "epoch": 0.01, + "grad_norm": 1.0151487588882446, + "kl": 0.010345458984375, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0667, + "num_tokens": 998979.0, + "reward": 1.55859375, + "reward_std": 0.5702340230345726, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.22813964635133743, + "step": 20, + "token_counts/after_target": 2065.0, + "token_counts/after_think": 352.0, + "token_counts/before_target": 1885.25, + "token_counts/before_think": 3039.5 + }, + { + "avg_penalty/after_target": 2.625555634498596, + "avg_penalty/after_think": 3.433610647916794, + "avg_penalty/before_target": 0.6690306439995766, + "avg_penalty/before_think": 0.8638944178819656, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 873.75, + "completions/max_terminated_length": 851.0, + "completions/mean_length": 468.484375, + "completions/mean_terminated_length": 446.4888458251953, + "completions/min_length": 130.25, + "completions/min_terminated_length": 130.25, + "epoch": 0.0105, + "grad_norm": 1.0594806671142578, + "kl": 0.011962890625, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.1009, + "num_tokens": 1038722.0, + "reward": 1.64453125, + "reward_std": 0.5712989270687103, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 
0.828125, + "rewards/format_reward/std": 0.32438503205776215, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.2941051125526428, + "step": 21, + "token_counts/after_target": 2185.5, + "token_counts/after_think": 134.75, + "token_counts/before_target": 3031.25, + "token_counts/before_think": 2144.25 + }, + { + "avg_penalty/after_target": 2.50067400932312, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.6493895947933197, + "avg_penalty/before_think": 0.8581817299127579, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 881.75, + "completions/max_terminated_length": 770.75, + "completions/mean_length": 364.28125, + "completions/mean_terminated_length": 310.61026763916016, + "completions/min_length": 86.5, + "completions/min_terminated_length": 86.5, + "epoch": 0.011, + "grad_norm": 1.4328420162200928, + "kl": 0.0189971923828125, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.3164, + "num_tokens": 1070788.0, + "reward": 1.56640625, + "reward_std": 0.5582901686429977, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.35648179799318314, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.3463866338133812, + "step": 22, + "token_counts/after_target": 1795.25, + "token_counts/after_think": 54.5, + "token_counts/before_target": 2682.0, + "token_counts/before_think": 1296.75 + }, + { + "avg_penalty/after_target": 2.2433803975582123, + "avg_penalty/after_think": 3.883265495300293, + "avg_penalty/before_target": 0.7544738873839378, + "avg_penalty/before_think": 0.6965024918317795, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 880.0, + "completions/max_terminated_length": 818.75, + "completions/mean_length": 420.125, + "completions/mean_terminated_length": 393.2074508666992, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.0115, + "grad_norm": 1.2829105854034424, + "kl": 0.01861572265625, + "learning_rate": 2.2e-06, + "loss": 0.2938, + "num_tokens": 1107596.0, + "reward": 1.77734375, + "reward_std": 0.47942350804805756, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.27289126068353653, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.2789809852838516, + "step": 23, + "token_counts/after_target": 1599.5, + "token_counts/after_think": 525.75, + "token_counts/before_target": 1933.75, + "token_counts/before_think": 2663.0 + }, + { + "avg_penalty/after_target": 2.8758557438850403, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5004135146737099, + "avg_penalty/before_think": 0.7526647448539734, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.25, + "completions/max_terminated_length": 647.25, + "completions/mean_length": 314.4375, + "completions/mean_terminated_length": 314.4375, + "completions/min_length": 101.75, + "completions/min_terminated_length": 101.75, + "epoch": 0.012, + "grad_norm": 1.4737523794174194, + "kl": 0.045074462890625, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.071, + "num_tokens": 1138408.0, + "reward": 1.8984375, + "reward_std": 0.22354720532894135, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.96875, + 
"rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.1610472071915865, + "step": 24, + "token_counts/after_target": 1203.25, + "token_counts/after_think": 105.75, + "token_counts/before_target": 2388.0, + "token_counts/before_think": 1334.0 + }, + { + "avg_penalty/after_target": 2.8267096877098083, + "avg_penalty/after_think": 3.838174343109131, + "avg_penalty/before_target": 0.5252215415239334, + "avg_penalty/before_think": 0.6367672383785248, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 627.0, + "completions/max_terminated_length": 606.5, + "completions/mean_length": 289.90625, + "completions/mean_terminated_length": 252.796875, + "completions/min_length": 54.25, + "completions/min_terminated_length": 54.25, + "epoch": 0.0125, + "grad_norm": 1.6574913263320923, + "kl": 0.051055908203125, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0369, + "num_tokens": 1168306.0, + "reward": 1.87109375, + "reward_std": 0.4784395694732666, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.11180340498685837, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3221946656703949, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.16690177470445633, + "step": 25, + "token_counts/after_target": 1505.0, + "token_counts/after_think": 57.75, + "token_counts/before_target": 1950.5, + "token_counts/before_think": 1125.25 + }, + { + "avg_penalty/after_target": 3.080408215522766, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4639463350176811, + "avg_penalty/before_think": 0.6256660148501396, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.015625, + "completions/max_length": 710.75, + "completions/max_terminated_length": 584.25, + "completions/mean_length": 285.875, + "completions/mean_terminated_length": 274.38645935058594, + "completions/min_length": 39.25, + "completions/min_terminated_length": 39.25, + "epoch": 0.013, + "grad_norm": 1.6387139558792114, + "kl": 0.036163330078125, + "learning_rate": 2.5e-06, + "loss": -0.0407, + "num_tokens": 1195610.0, + "reward": 1.84765625, + "reward_std": 0.41243139654397964, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.31116948276758194, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.1193853635340929, + "step": 26, + "token_counts/after_target": 1210.0, + "token_counts/after_think": 76.75, + "token_counts/before_target": 1827.75, + "token_counts/before_think": 1459.5 + }, + { + "avg_penalty/after_target": 2.3724108934402466, + "avg_penalty/after_think": 3.7581411600112915, + "avg_penalty/before_target": 0.5363590121269226, + "avg_penalty/before_think": 0.5742836445569992, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 649.75, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 251.046875, + "completions/mean_terminated_length": 213.94711685180664, + "completions/min_length": 60.75, + "completions/min_terminated_length": 60.75, + "epoch": 0.0135, + "grad_norm": 1.8711671829223633, + "kl": 0.0373382568359375, + "learning_rate": 2.6e-06, + "loss": 0.3004, + "num_tokens": 1221693.0, + "reward": 1.91796875, + "reward_std": 0.43868668377399445, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.11967839300632477, + "rewards/format_reward/mean": 0.921875, + 
"rewards/format_reward/std": 0.17430340498685837, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.2252109795808792, + "step": 27, + "token_counts/after_target": 893.75, + "token_counts/after_think": 237.25, + "token_counts/before_target": 1205.0, + "token_counts/before_think": 1680.75 + }, + { + "avg_penalty/after_target": 2.4586858451366425, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.47475640848279, + "avg_penalty/before_think": 0.4516104683279991, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 701.5, + "completions/max_terminated_length": 642.75, + "completions/mean_length": 263.4375, + "completions/mean_terminated_length": 251.86771392822266, + "completions/min_length": 54.5, + "completions/min_terminated_length": 54.5, + "epoch": 0.014, + "grad_norm": 1.342960000038147, + "kl": 0.046875, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.1338, + "num_tokens": 1249033.0, + "reward": 1.97265625, + "reward_std": 0.43248794972896576, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.1875, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.23328252136707306, + "rewards/tag_count_reward/mean": 0.94140625, + "rewards/tag_count_reward/std": 0.1481337696313858, + "step": 28, + "token_counts/after_target": 800.75, + "token_counts/after_think": 33.5, + "token_counts/before_target": 2004.75, + "token_counts/before_think": 1376.0 + }, + { + "avg_penalty/after_target": 2.5293178856372833, + "avg_penalty/after_think": 3.857387959957123, + "avg_penalty/before_target": 0.3591475822031498, + "avg_penalty/before_think": 0.511518120765686, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 487.5, + "completions/max_terminated_length": 487.5, + "completions/mean_length": 211.578125, + "completions/mean_terminated_length": 211.578125, + "completions/min_length": 58.25, + "completions/min_terminated_length": 58.25, + "epoch": 0.0145, + "grad_norm": 2.0882577896118164, + "kl": 0.03814697265625, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0417, + "num_tokens": 1274670.0, + "reward": 1.99609375, + "reward_std": 0.015625, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 29, + "token_counts/after_target": 503.0, + "token_counts/after_think": 135.75, + "token_counts/before_target": 1400.0, + "token_counts/before_think": 1346.5 + }, + { + "avg_penalty/after_target": 1.930746704339981, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.589110940694809, + "avg_penalty/before_think": 0.6263491287827492, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 767.25, + "completions/max_terminated_length": 679.25, + "completions/mean_length": 263.796875, + "completions/mean_terminated_length": 238.27232360839844, + "completions/min_length": 66.25, + "completions/min_terminated_length": 66.25, + "epoch": 0.015, + "grad_norm": 1.281786322593689, + "kl": 0.031494140625, + "learning_rate": 2.9e-06, + "loss": 0.1027, + "num_tokens": 1301537.0, + "reward": 1.99609375, + "reward_std": 0.4663008749485016, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.12909944355487823, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.24866948276758194, + "rewards/tag_count_reward/mean": 
0.96484375, + "rewards/tag_count_reward/std": 0.10717359744012356, + "step": 30, + "token_counts/after_target": 999.0, + "token_counts/after_think": 69.0, + "token_counts/before_target": 1797.5, + "token_counts/before_think": 1355.25 + }, + { + "avg_penalty/after_target": 2.8155108094215393, + "avg_penalty/after_think": 3.910028338432312, + "avg_penalty/before_target": 0.32582544162869453, + "avg_penalty/before_think": 0.4461340829730034, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.5, + "completions/max_terminated_length": 475.5, + "completions/mean_length": 201.546875, + "completions/mean_terminated_length": 201.546875, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.0155, + "grad_norm": 1.933602213859558, + "kl": 0.055328369140625, + "learning_rate": 3e-06, + "loss": -0.0267, + "num_tokens": 1327268.0, + "reward": 1.953125, + "reward_std": 0.1875, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.0625, + "step": 31, + "token_counts/after_target": 647.0, + "token_counts/after_think": 62.5, + "token_counts/before_target": 1031.25, + "token_counts/before_think": 1484.0 + }, + { + "avg_penalty/after_target": 2.1403911113739014, + "avg_penalty/after_think": 3.7047048211097717, + "avg_penalty/before_target": 0.5191575437784195, + "avg_penalty/before_think": 0.6447896808385849, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 718.0, + "completions/max_terminated_length": 679.25, + 
"completions/mean_length": 319.84375, + "completions/mean_terminated_length": 309.89583587646484, + "completions/min_length": 102.25, + "completions/min_terminated_length": 102.25, + "epoch": 0.016, + "grad_norm": 0.5082431435585022, + "kl": 0.028961181640625, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0885, + "num_tokens": 1356234.0, + "reward": 1.98046875, + "reward_std": 0.078125, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 32, + "token_counts/after_target": 1190.75, + "token_counts/after_think": 59.25, + "token_counts/before_target": 2301.75, + "token_counts/before_think": 1565.75 + }, + { + "avg_penalty/after_target": 1.8697022795677185, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.32824769616127014, + "avg_penalty/before_think": 0.5182772353291512, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.25, + "completions/max_terminated_length": 640.25, + "completions/mean_length": 283.640625, + "completions/mean_terminated_length": 283.640625, + "completions/min_length": 65.5, + "completions/min_terminated_length": 65.5, + "epoch": 0.0165, + "grad_norm": 0.5919519662857056, + "kl": 0.0254974365234375, + "learning_rate": 3.2000000000000003e-06, + "loss": -0.0081, + "num_tokens": 1384211.0, + "reward": 2.015625, + "reward_std": 0.0625, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 33, + "token_counts/after_target": 541.75, + "token_counts/after_think": 
193.0, + "token_counts/before_target": 1463.75, + "token_counts/before_think": 2339.75 + }, + { + "avg_penalty/after_target": 2.3474150598049164, + "avg_penalty/after_think": 3.7929523587226868, + "avg_penalty/before_target": 0.4563576653599739, + "avg_penalty/before_think": 0.6543823033571243, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 864.5, + "completions/max_terminated_length": 864.5, + "completions/mean_length": 375.703125, + "completions/mean_terminated_length": 375.703125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.017, + "grad_norm": 0.4397278130054474, + "kl": 0.02313232421875, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0089, + "num_tokens": 1417280.0, + "reward": 1.97265625, + "reward_std": 0.109375, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.046875, + "step": 34, + "token_counts/after_target": 1379.25, + "token_counts/after_think": 104.0, + "token_counts/before_target": 1883.5, + "token_counts/before_think": 2644.5 + }, + { + "avg_penalty/after_target": 2.4144216775894165, + "avg_penalty/after_think": 3.6769659519195557, + "avg_penalty/before_target": 0.44865279644727707, + "avg_penalty/before_think": 0.6940566450357437, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 906.5, + "completions/max_terminated_length": 832.75, + "completions/mean_length": 498.34375, + "completions/mean_terminated_length": 481.07293701171875, + "completions/min_length": 190.75, + 
"completions/min_terminated_length": 190.75, + "epoch": 0.0175, + "grad_norm": 0.5580074191093445, + "kl": 0.019866943359375, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.132, + "num_tokens": 1461542.0, + "reward": 1.9609375, + "reward_std": 0.15625, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.03125, + "step": 35, + "token_counts/after_target": 1212.0, + "token_counts/after_think": 709.25, + "token_counts/before_target": 1145.0, + "token_counts/before_think": 4907.25 + }, + { + "avg_penalty/after_target": 1.9997824132442474, + "avg_penalty/after_think": 3.907633662223816, + "avg_penalty/before_target": 0.39275994524359703, + "avg_penalty/before_think": 0.5883385688066483, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 698.25, + "completions/max_terminated_length": 654.0, + "completions/mean_length": 385.921875, + "completions/mean_terminated_length": 364.3269348144531, + "completions/min_length": 132.25, + "completions/min_terminated_length": 132.25, + "epoch": 0.018, + "grad_norm": 0.7553008198738098, + "kl": 0.0244293212890625, + "learning_rate": 3.5e-06, + "loss": 0.0898, + "num_tokens": 1496881.0, + "reward": 1.9140625, + "reward_std": 0.3109320253133774, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.2257782220840454, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.06520001962780952, + "step": 36, + "token_counts/after_target": 988.5, + "token_counts/after_think": 495.25, + "token_counts/before_target": 1049.5, + "token_counts/before_think": 
3641.5 + }, + { + "avg_penalty/after_target": 2.487692356109619, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5487125366926193, + "avg_penalty/before_think": 0.6720221787691116, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 872.5, + "completions/max_terminated_length": 769.75, + "completions/mean_length": 497.40625, + "completions/mean_terminated_length": 478.1294708251953, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.0185, + "grad_norm": 3.7181501388549805, + "kl": 0.07012939453125, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.1074, + "num_tokens": 1536811.0, + "reward": 1.94140625, + "reward_std": 0.2242269217967987, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.10077822208404541, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.06442352384328842, + "step": 37, + "token_counts/after_target": 1536.5, + "token_counts/after_think": 440.5, + "token_counts/before_target": 1117.75, + "token_counts/before_think": 4863.75 + }, + { + "avg_penalty/after_target": 2.0312521159648895, + "avg_penalty/after_think": 3.9276122450828552, + "avg_penalty/before_target": 0.5713187530636787, + "avg_penalty/before_think": 0.6272246837615967, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 798.75, + "completions/max_terminated_length": 798.75, + "completions/mean_length": 433.0, + "completions/mean_terminated_length": 433.0, + "completions/min_length": 156.25, + "completions/min_terminated_length": 156.25, + "epoch": 0.019, 
+ "grad_norm": 0.07096903026103973, + "kl": 0.01910400390625, + "learning_rate": 3.7e-06, + "loss": 0.0008, + "num_tokens": 1574059.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 38, + "token_counts/after_target": 763.75, + "token_counts/after_think": 488.5, + "token_counts/before_target": 644.5, + "token_counts/before_think": 5031.25 + }, + { + "avg_penalty/after_target": 2.220779061317444, + "avg_penalty/after_think": 3.7671355605125427, + "avg_penalty/before_target": 0.5380886048078537, + "avg_penalty/before_think": 0.5240800976753235, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 633.5, + "completions/max_terminated_length": 612.0, + "completions/mean_length": 368.40625, + "completions/mean_terminated_length": 360.828125, + "completions/min_length": 157.75, + "completions/min_terminated_length": 157.75, + "epoch": 0.0195, + "grad_norm": 0.3210369944572449, + "kl": 0.02569580078125, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0419, + "num_tokens": 1609141.0, + "reward": 1.98828125, + "reward_std": 0.12953995168209076, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.046875, + "step": 39, + "token_counts/after_target": 724.5, + "token_counts/after_think": 444.5, + "token_counts/before_target": 840.5, + "token_counts/before_think": 3885.0 + }, + { + "avg_penalty/after_target": 1.7631750702857971, + "avg_penalty/after_think": 3.9250295758247375, + 
"avg_penalty/before_target": 0.5079378932714462, + "avg_penalty/before_think": 0.6070213168859482, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 818.25, + "completions/max_terminated_length": 703.5, + "completions/mean_length": 413.078125, + "completions/mean_terminated_length": 401.55521392822266, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.02, + "grad_norm": 1.0533888339996338, + "kl": 0.0252685546875, + "learning_rate": 3.900000000000001e-06, + "loss": 0.1113, + "num_tokens": 1645338.0, + "reward": 1.9375, + "reward_std": 0.25, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.1875, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.0625, + "step": 40, + "token_counts/after_target": 745.25, + "token_counts/after_think": 611.25, + "token_counts/before_target": 846.25, + "token_counts/before_think": 4406.5 + }, + { + "avg_penalty/after_target": 2.1066566705703735, + "avg_penalty/after_think": 3.916032135486603, + "avg_penalty/before_target": 0.6627626121044159, + "avg_penalty/before_think": 0.6748418807983398, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 991.25, + "completions/max_terminated_length": 870.5, + "completions/mean_length": 514.09375, + "completions/mean_terminated_length": 489.45731353759766, + "completions/min_length": 214.25, + "completions/min_terminated_length": 214.25, + "epoch": 0.0205, + "grad_norm": 0.6255768537521362, + "kl": 0.0134429931640625, + "learning_rate": 4.000000000000001e-06, + "loss": 0.1278, + 
"num_tokens": 1686848.0, + "reward": 1.92578125, + "reward_std": 0.23947912454605103, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.14789126068353653, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.09528729319572449, + "step": 41, + "token_counts/after_target": 1639.75, + "token_counts/after_think": 562.75, + "token_counts/before_target": 1619.0, + "token_counts/before_think": 4404.0 + }, + { + "avg_penalty/after_target": 1.8689508736133575, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3279123231768608, + "avg_penalty/before_think": 0.656202107667923, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 758.0, + "completions/max_terminated_length": 758.0, + "completions/mean_length": 466.734375, + "completions/mean_terminated_length": 466.734375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.021, + "grad_norm": 0.39212313294410706, + "kl": 0.025482177734375, + "learning_rate": 4.1e-06, + "loss": -0.0064, + "num_tokens": 1729679.0, + "reward": 2.015625, + "reward_std": 0.0625, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 42, + "token_counts/after_target": 942.75, + "token_counts/after_think": 757.0, + "token_counts/before_target": 636.5, + "token_counts/before_think": 5131.5 + }, + { + "avg_penalty/after_target": 1.879249393939972, + "avg_penalty/after_think": 3.7608871459960938, + "avg_penalty/before_target": 0.42791344225406647, + "avg_penalty/before_think": 0.657097652554512, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 770.25, + "completions/max_terminated_length": 755.25, + "completions/mean_length": 460.546875, + "completions/mean_terminated_length": 455.3177185058594, + "completions/min_length": 159.25, + "completions/min_terminated_length": 159.25, + "epoch": 0.0215, + "grad_norm": 0.36569246649742126, + "kl": 0.02142333984375, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0285, + "num_tokens": 1767810.0, + "reward": 1.98046875, + "reward_std": 0.078125, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 43, + "token_counts/after_target": 1099.5, + "token_counts/after_think": 457.25, + "token_counts/before_target": 877.25, + "token_counts/before_think": 4934.75 + }, + { + "avg_penalty/after_target": 2.2495291233062744, + "avg_penalty/after_think": 3.8211018443107605, + "avg_penalty/before_target": 1.0947460383176804, + "avg_penalty/before_think": 0.7480838745832443, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 933.75, + "completions/mean_length": 685.921875, + "completions/mean_terminated_length": 623.7199478149414, + "completions/min_length": 357.75, + "completions/min_terminated_length": 357.75, + "epoch": 0.022, + "grad_norm": 0.8933461904525757, + "kl": 0.01397705078125, + "learning_rate": 4.3e-06, + "loss": 0.1567, + "num_tokens": 1824669.0, + "reward": 1.7421875, + "reward_std": 0.5917072892189026, + 
"rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3683478757739067, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.20953459292650223, + "step": 44, + "token_counts/after_target": 2324.25, + "token_counts/after_think": 924.75, + "token_counts/before_target": 1350.25, + "token_counts/before_think": 6375.5 + }, + { + "avg_penalty/after_target": 1.4774491488933563, + "avg_penalty/after_think": 3.950641095638275, + "avg_penalty/before_target": 0.39612314105033875, + "avg_penalty/before_think": 0.6775318533182144, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 970.75, + "completions/max_terminated_length": 901.0, + "completions/mean_length": 581.484375, + "completions/mean_terminated_length": 567.1010589599609, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.0225, + "grad_norm": 0.6992694735527039, + "kl": 0.0156707763671875, + "learning_rate": 4.4e-06, + "loss": 0.1049, + "num_tokens": 1873036.0, + "reward": 1.9765625, + "reward_std": 0.21875, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.03125, + "step": 45, + "token_counts/after_target": 652.5, + "token_counts/after_think": 1017.0, + "token_counts/before_target": 443.25, + "token_counts/before_think": 7191.0 + }, + { + "avg_penalty/after_target": 2.3008689284324646, + "avg_penalty/after_think": 3.6240891218185425, + "avg_penalty/before_target": 0.7436505109071732, + "avg_penalty/before_think": 0.5943488106131554, + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 797.0, + "completions/max_terminated_length": 786.25, + "completions/mean_length": 504.96875, + "completions/mean_terminated_length": 494.7991180419922, + "completions/min_length": 252.75, + "completions/min_terminated_length": 252.75, + "epoch": 0.023, + "grad_norm": 0.5017008781433105, + "kl": 0.016448974609375, + "learning_rate": 4.5e-06, + "loss": -0.014, + "num_tokens": 1914858.0, + "reward": 1.88671875, + "reward_std": 0.24890750646591187, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.17430340498685837, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.08017472177743912, + "step": 46, + "token_counts/after_target": 1267.0, + "token_counts/after_think": 397.25, + "token_counts/before_target": 867.25, + "token_counts/before_think": 5548.0 + }, + { + "avg_penalty/after_target": 1.881275862455368, + "avg_penalty/after_think": 3.7406960129737854, + "avg_penalty/before_target": 0.61200912296772, + "avg_penalty/before_think": 0.7217975482344627, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 837.75, + "completions/max_terminated_length": 837.25, + "completions/mean_length": 546.359375, + "completions/mean_terminated_length": 528.96875, + "completions/min_length": 282.5, + "completions/min_terminated_length": 282.5, + "epoch": 0.0235, + "grad_norm": 1.0727949142456055, + "kl": 0.0155487060546875, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0859, + "num_tokens": 1964689.0, + "reward": 1.8828125, + "reward_std": 0.4071238487958908, + "rewards/accuracy_reward/mean": 
0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.23680340498685837, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.1377599686384201, + "step": 47, + "token_counts/after_target": 1951.5, + "token_counts/after_think": 605.75, + "token_counts/before_target": 1583.0, + "token_counts/before_think": 4601.5 + }, + { + "avg_penalty/after_target": 2.0469044744968414, + "avg_penalty/after_think": 3.9011847376823425, + "avg_penalty/before_target": 0.44673267006874084, + "avg_penalty/before_think": 0.5079410523176193, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 370.90625, + "completions/mean_terminated_length": 370.90625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.024, + "grad_norm": 1.009267807006836, + "kl": 0.033416748046875, + "learning_rate": 4.7e-06, + "loss": 0.0557, + "num_tokens": 1998875.0, + "reward": 2.015625, + "reward_std": 0.21325481683015823, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.11180340498685837, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.0625, + "step": 48, + "token_counts/after_target": 409.0, + "token_counts/after_think": 526.0, + "token_counts/before_target": 691.5, + "token_counts/before_think": 4308.0 + }, + { + "avg_penalty/after_target": 1.9882097244262695, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5475198104977608, + "avg_penalty/before_think": 0.6170122921466827, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 
0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 829.0, + "completions/max_terminated_length": 781.25, + "completions/mean_length": 489.96875, + "completions/mean_terminated_length": 482.234375, + "completions/min_length": 259.25, + "completions/min_terminated_length": 259.25, + "epoch": 0.0245, + "grad_norm": 0.7983074188232422, + "kl": 0.0181427001953125, + "learning_rate": 4.800000000000001e-06, + "loss": 0.1007, + "num_tokens": 2040441.0, + "reward": 1.91796875, + "reward_std": 0.328125, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.1875, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.140625, + "step": 49, + "token_counts/after_target": 707.0, + "token_counts/after_think": 630.25, + "token_counts/before_target": 853.5, + "token_counts/before_think": 5648.75 + }, + { + "avg_penalty/after_target": 1.7924420833587646, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.32500576972961426, + "avg_penalty/before_think": 0.6646995544433594, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 854.5, + "completions/max_terminated_length": 854.5, + "completions/mean_length": 523.96875, + "completions/mean_terminated_length": 523.96875, + "completions/min_length": 285.75, + "completions/min_terminated_length": 285.75, + "epoch": 0.025, + "grad_norm": 0.4345921576023102, + "kl": 0.0172882080078125, + "learning_rate": 4.9000000000000005e-06, + "loss": -0.0011, + "num_tokens": 2083559.0, + "reward": 2.09375, + "reward_std": 0.125, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.125, + "rewards/format_reward/mean": 1.0, + 
"rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 50, + "token_counts/after_target": 686.5, + "token_counts/after_think": 601.5, + "token_counts/before_target": 634.75, + "token_counts/before_think": 6460.75 + }, + { + "avg_penalty/after_target": 2.024613618850708, + "avg_penalty/after_think": 3.6287800073623657, + "avg_penalty/before_target": 0.7468253001570702, + "avg_penalty/before_think": 0.705940380692482, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 1017.25, + "completions/max_terminated_length": 916.75, + "completions/mean_length": 617.1875, + "completions/mean_terminated_length": 573.2175750732422, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.0255, + "grad_norm": 0.6576520204544067, + "kl": 0.01375579833984375, + "learning_rate": 5e-06, + "loss": 0.1411, + "num_tokens": 2135859.0, + "reward": 1.7578125, + "reward_std": 0.48615267872810364, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.3133598491549492, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.17603819072246552, + "step": 51, + "token_counts/after_target": 1627.25, + "token_counts/after_think": 816.25, + "token_counts/before_target": 1279.75, + "token_counts/before_think": 6151.75 + }, + { + "avg_penalty/after_target": 1.7005553841590881, + "avg_penalty/after_think": 3.8414027094841003, + "avg_penalty/before_target": 0.626686304807663, + "avg_penalty/before_think": 0.6855221837759018, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 
0.078125, + "completions/max_length": 948.25, + "completions/max_terminated_length": 854.0, + "completions/mean_length": 588.84375, + "completions/mean_terminated_length": 557.8739624023438, + "completions/min_length": 250.25, + "completions/min_terminated_length": 250.25, + "epoch": 0.026, + "grad_norm": 0.8806253671646118, + "kl": 0.015472412109375, + "learning_rate": 5.1e-06, + "loss": 0.0921, + "num_tokens": 2183353.0, + "reward": 1.85546875, + "reward_std": 0.47328995168209076, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.29930340498685837, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.1597641110420227, + "step": 52, + "token_counts/after_target": 1004.5, + "token_counts/after_think": 995.5, + "token_counts/before_target": 1059.0, + "token_counts/before_think": 6362.5 + }, + { + "avg_penalty/after_target": 2.251842826604843, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.616138394922018, + "avg_penalty/before_think": 0.7066884785890579, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 926.0, + "completions/max_terminated_length": 839.5, + "completions/mean_length": 554.703125, + "completions/mean_terminated_length": 539.0245666503906, + "completions/min_length": 294.75, + "completions/min_terminated_length": 294.75, + "epoch": 0.0265, + "grad_norm": 0.7359876036643982, + "kl": 0.019195556640625, + "learning_rate": 5.2e-06, + "loss": 0.1235, + "num_tokens": 2231302.0, + "reward": 1.93359375, + "reward_std": 0.20822912454605103, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.14789126068353653, + 
"rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.06403729319572449, + "step": 53, + "token_counts/after_target": 1432.25, + "token_counts/after_think": 698.75, + "token_counts/before_target": 912.5, + "token_counts/before_think": 5831.75 + }, + { + "avg_penalty/after_target": 2.2055463790893555, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5432018935680389, + "avg_penalty/before_think": 0.7600558996200562, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 787.75, + "completions/max_terminated_length": 755.5, + "completions/mean_length": 496.421875, + "completions/mean_terminated_length": 483.1156311035156, + "completions/min_length": 199.75, + "completions/min_terminated_length": 199.75, + "epoch": 0.027, + "grad_norm": 0.5326834917068481, + "kl": 0.020416259765625, + "learning_rate": 5.300000000000001e-06, + "loss": 0.0688, + "num_tokens": 2276129.0, + "reward": 1.93359375, + "reward_std": 0.20822912454605103, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.14789126068353653, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.06403729319572449, + "step": 54, + "token_counts/after_target": 1786.25, + "token_counts/after_think": 332.0, + "token_counts/before_target": 1937.0, + "token_counts/before_think": 3887.5 + }, + { + "avg_penalty/after_target": 2.0182646214962006, + "avg_penalty/after_think": 3.706880033016205, + "avg_penalty/before_target": 0.3784146085381508, + "avg_penalty/before_think": 0.5939260348677635, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + 
"completions/max_length": 827.25, + "completions/max_terminated_length": 778.0, + "completions/mean_length": 417.078125, + "completions/mean_terminated_length": 408.1885452270508, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.0275, + "grad_norm": 0.5133959054946899, + "kl": 0.023681640625, + "learning_rate": 5.400000000000001e-06, + "loss": 0.0689, + "num_tokens": 2312070.0, + "reward": 1.98046875, + "reward_std": 0.078125, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 55, + "token_counts/after_target": 539.0, + "token_counts/after_think": 448.0, + "token_counts/before_target": 832.0, + "token_counts/before_think": 4854.25 + }, + { + "avg_penalty/after_target": 2.099721908569336, + "avg_penalty/after_think": 3.5369834303855896, + "avg_penalty/before_target": 0.5649203285574913, + "avg_penalty/before_think": 0.6414124369621277, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 941.5, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 444.46875, + "completions/mean_terminated_length": 414.49793243408203, + "completions/min_length": 175.75, + "completions/min_terminated_length": 175.75, + "epoch": 0.028, + "grad_norm": 0.881290853023529, + "kl": 0.022125244140625, + "learning_rate": 5.500000000000001e-06, + "loss": 0.2782, + "num_tokens": 2349044.0, + "reward": 1.91796875, + "reward_std": 0.328125, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.1875, + "rewards/tag_count_reward/mean": 0.96484375, + 
"rewards/tag_count_reward/std": 0.140625, + "step": 56, + "token_counts/after_target": 1428.0, + "token_counts/after_think": 255.25, + "token_counts/before_target": 1786.25, + "token_counts/before_think": 3642.0 + }, + { + "avg_penalty/after_target": 2.5822675824165344, + "avg_penalty/after_think": 3.91267853975296, + "avg_penalty/before_target": 0.431605938822031, + "avg_penalty/before_think": 0.519464798271656, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 593.5, + "completions/max_terminated_length": 568.25, + "completions/mean_length": 323.953125, + "completions/mean_terminated_length": 315.6260452270508, + "completions/min_length": 150.5, + "completions/min_terminated_length": 150.5, + "epoch": 0.0285, + "grad_norm": 0.3820003867149353, + "kl": 0.028472900390625, + "learning_rate": 5.600000000000001e-06, + "loss": 0.0607, + "num_tokens": 2377953.0, + "reward": 1.97265625, + "reward_std": 0.109375, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.046875, + "step": 57, + "token_counts/after_target": 943.5, + "token_counts/after_think": 139.75, + "token_counts/before_target": 1313.5, + "token_counts/before_think": 2786.5 + }, + { + "avg_penalty/after_target": 2.113308370113373, + "avg_penalty/after_think": 3.956179201602936, + "avg_penalty/before_target": 0.5293030738830566, + "avg_penalty/before_think": 0.6002845987677574, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 756.0, + "completions/max_terminated_length": 756.0, + 
"completions/mean_length": 374.125, + "completions/mean_terminated_length": 374.125, + "completions/min_length": 143.25, + "completions/min_terminated_length": 143.25, + "epoch": 0.029, + "grad_norm": 0.41826435923576355, + "kl": 0.022613525390625, + "learning_rate": 5.7e-06, + "loss": -0.0162, + "num_tokens": 2410729.0, + "reward": 2.015625, + "reward_std": 0.0625, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 58, + "token_counts/after_target": 1274.0, + "token_counts/after_think": 330.25, + "token_counts/before_target": 1986.75, + "token_counts/before_think": 2395.0 + }, + { + "avg_penalty/after_target": 2.387886196374893, + "avg_penalty/after_think": 3.7940142154693604, + "avg_penalty/before_target": 0.31289907544851303, + "avg_penalty/before_think": 0.6091955900192261, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.75, + "completions/max_terminated_length": 559.75, + "completions/mean_length": 247.484375, + "completions/mean_terminated_length": 247.484375, + "completions/min_length": 119.5, + "completions/min_terminated_length": 119.5, + "epoch": 0.0295, + "grad_norm": 0.637260377407074, + "kl": 0.04632568359375, + "learning_rate": 5.8e-06, + "loss": 0.0303, + "num_tokens": 2434568.0, + "reward": 1.98046875, + "reward_std": 0.078125, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 59, + "token_counts/after_target": 540.0, + "token_counts/after_think": 132.0, + 
"token_counts/before_target": 1819.25, + "token_counts/before_think": 1468.5 + }, + { + "avg_penalty/after_target": 2.959477663040161, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3269215375185013, + "avg_penalty/before_think": 0.5259017869830132, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 260.578125, + "completions/mean_terminated_length": 260.578125, + "completions/min_length": 111.25, + "completions/min_terminated_length": 111.25, + "epoch": 0.03, + "grad_norm": 0.40239420533180237, + "kl": 0.038360595703125, + "learning_rate": 5.9e-06, + "loss": -0.0079, + "num_tokens": 2462637.0, + "reward": 2.046875, + "reward_std": 0.10077822208404541, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 60, + "token_counts/after_target": 618.5, + "token_counts/after_think": 163.25, + "token_counts/before_target": 1519.25, + "token_counts/before_think": 1868.25 + }, + { + "avg_penalty/after_target": 2.4327386021614075, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3409707322716713, + "avg_penalty/before_think": 0.6381470188498497, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 339.640625, + "completions/mean_terminated_length": 339.640625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 
0.0305, + "grad_norm": 0.4167066514492035, + "kl": 0.02764892578125, + "learning_rate": 6e-06, + "loss": 0.0024, + "num_tokens": 2494582.0, + "reward": 2.078125, + "reward_std": 0.11967839300632477, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.11967839300632477, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 61, + "token_counts/after_target": 890.0, + "token_counts/after_think": 175.0, + "token_counts/before_target": 2230.5, + "token_counts/before_think": 2138.75 + }, + { + "avg_penalty/after_target": 2.4609434008598328, + "avg_penalty/after_think": 3.83338326215744, + "avg_penalty/before_target": 0.3302932642400265, + "avg_penalty/before_think": 0.5984988957643509, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.25, + "completions/max_terminated_length": 540.25, + "completions/mean_length": 301.34375, + "completions/mean_terminated_length": 301.34375, + "completions/min_length": 159.25, + "completions/min_terminated_length": 159.25, + "epoch": 0.031, + "grad_norm": 0.5158855319023132, + "kl": 0.03466796875, + "learning_rate": 6.1e-06, + "loss": -0.0235, + "num_tokens": 2522572.0, + "reward": 2.046875, + "reward_std": 0.10077822208404541, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 62, + "token_counts/after_target": 683.75, + "token_counts/after_think": 128.5, + "token_counts/before_target": 2034.75, + "token_counts/before_think": 1974.5 + }, + { + "avg_penalty/after_target": 2.3586795926094055, + "avg_penalty/after_think": 4.0, + 
"avg_penalty/before_target": 0.4270870238542557, + "avg_penalty/before_think": 0.5250855833292007, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.25, + "completions/max_terminated_length": 553.25, + "completions/mean_length": 323.8125, + "completions/mean_terminated_length": 323.8125, + "completions/min_length": 120.75, + "completions/min_terminated_length": 120.75, + "epoch": 0.0315, + "grad_norm": 0.10066765546798706, + "kl": 0.0372314453125, + "learning_rate": 6.200000000000001e-06, + "loss": 0.0015, + "num_tokens": 2555360.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 63, + "token_counts/after_target": 861.25, + "token_counts/after_think": 202.0, + "token_counts/before_target": 2156.25, + "token_counts/before_think": 1961.5 + }, + { + "avg_penalty/after_target": 2.709577977657318, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3838350176811218, + "avg_penalty/before_think": 0.5992186665534973, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.75, + "completions/max_terminated_length": 618.75, + "completions/mean_length": 344.703125, + "completions/mean_terminated_length": 344.703125, + "completions/min_length": 162.5, + "completions/min_terminated_length": 162.5, + "epoch": 0.032, + "grad_norm": 0.09925830364227295, + "kl": 0.04315185546875, + "learning_rate": 6.300000000000001e-06, + "loss": 0.0017, + "num_tokens": 2590189.0, + "reward": 2.0, + "reward_std": 0.0, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 64, + "token_counts/after_target": 859.75, + "token_counts/after_think": 245.0, + "token_counts/before_target": 1796.75, + "token_counts/before_think": 2613.75 + }, + { + "avg_penalty/after_target": 2.195526272058487, + "avg_penalty/after_think": 3.7654223442077637, + "avg_penalty/before_target": 0.33790165930986404, + "avg_penalty/before_think": 0.6072150021791458, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.25, + "completions/max_terminated_length": 626.25, + "completions/mean_length": 378.625, + "completions/mean_terminated_length": 378.625, + "completions/min_length": 145.75, + "completions/min_terminated_length": 145.75, + "epoch": 0.0325, + "grad_norm": 0.6029790639877319, + "kl": 0.03302001953125, + "learning_rate": 6.4000000000000006e-06, + "loss": -0.0166, + "num_tokens": 2625333.0, + "reward": 1.99609375, + "reward_std": 0.140625, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 65, + "token_counts/after_target": 674.0, + "token_counts/after_think": 509.25, + "token_counts/before_target": 1282.25, + "token_counts/before_think": 3592.5 + }, + { + "avg_penalty/after_target": 2.023210287094116, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.39667897298932076, + "avg_penalty/before_think": 0.5897247716784477, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.75, + "completions/max_terminated_length": 632.75, + "completions/mean_length": 350.125, + "completions/mean_terminated_length": 350.125, + "completions/min_length": 121.5, + "completions/min_terminated_length": 121.5, + "epoch": 0.033, + "grad_norm": 0.06873376667499542, + "kl": 0.0341796875, + "learning_rate": 6.5000000000000004e-06, + "loss": 0.0014, + "num_tokens": 2655565.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 66, + "token_counts/after_target": 802.25, + "token_counts/after_think": 275.75, + "token_counts/before_target": 1615.25, + "token_counts/before_think": 2908.75 + }, + { + "avg_penalty/after_target": 2.053039699792862, + "avg_penalty/after_think": 3.962563395500183, + "avg_penalty/before_target": 0.34105294942855835, + "avg_penalty/before_think": 0.7001773715019226, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 794.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 499.875, + "completions/mean_terminated_length": 499.875, + "completions/min_length": 284.75, + "completions/min_terminated_length": 284.75, + "epoch": 0.0335, + "grad_norm": 0.061198603361845016, + "kl": 0.02685546875, + "learning_rate": 6.600000000000001e-06, + "loss": 0.0011, + "num_tokens": 2697461.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + 
"rewards/tag_count_reward/std": 0.0, + "step": 67, + "token_counts/after_target": 1249.5, + "token_counts/after_think": 391.0, + "token_counts/before_target": 1499.25, + "token_counts/before_think": 4858.25 + }, + { + "avg_penalty/after_target": 1.906721979379654, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3612286075949669, + "avg_penalty/before_think": 0.5652895346283913, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.25, + "completions/max_terminated_length": 566.25, + "completions/mean_length": 371.875, + "completions/mean_terminated_length": 371.875, + "completions/min_length": 212.75, + "completions/min_terminated_length": 212.75, + "epoch": 0.034, + "grad_norm": 0.429094523191452, + "kl": 0.031646728515625, + "learning_rate": 6.700000000000001e-06, + "loss": -0.0013, + "num_tokens": 2730973.0, + "reward": 2.046875, + "reward_std": 0.10077822208404541, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 68, + "token_counts/after_target": 470.75, + "token_counts/after_think": 487.75, + "token_counts/before_target": 1252.25, + "token_counts/before_think": 3739.25 + }, + { + "avg_penalty/after_target": 2.162435293197632, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4089372381567955, + "avg_penalty/before_think": 0.6366221606731415, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 842.0, + "completions/max_terminated_length": 802.25, + "completions/mean_length": 477.5625, + 
"completions/mean_terminated_length": 470.1458435058594, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.0345, + "grad_norm": 0.5763460993766785, + "kl": 0.03131103515625, + "learning_rate": 6.800000000000001e-06, + "loss": 0.0495, + "num_tokens": 2771697.0, + "reward": 1.93359375, + "reward_std": 0.2161140739917755, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.14789126068353653, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.06822281517088413, + "step": 69, + "token_counts/after_target": 1072.75, + "token_counts/after_think": 686.5, + "token_counts/before_target": 1253.5, + "token_counts/before_think": 4628.25 + }, + { + "avg_penalty/after_target": 2.6000292897224426, + "avg_penalty/after_think": 3.81212317943573, + "avg_penalty/before_target": 0.5470425300300121, + "avg_penalty/before_think": 0.4677771329879761, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 688.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 396.046875, + "completions/mean_terminated_length": 387.2822952270508, + "completions/min_length": 184.75, + "completions/min_terminated_length": 184.75, + "epoch": 0.035, + "grad_norm": 0.4193589687347412, + "kl": 0.03857421875, + "learning_rate": 6.9e-06, + "loss": 0.0675, + "num_tokens": 2808148.0, + "reward": 1.97265625, + "reward_std": 0.109375, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.046875, + "step": 70, + "token_counts/after_target": 703.5, + 
"token_counts/after_think": 602.5, + "token_counts/before_target": 963.75, + "token_counts/before_think": 4067.0 + }, + { + "avg_penalty/after_target": 1.7799786925315857, + "avg_penalty/after_think": 3.9186293482780457, + "avg_penalty/before_target": 0.3980545289814472, + "avg_penalty/before_think": 0.6256169676780701, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.25, + "completions/max_terminated_length": 626.25, + "completions/mean_length": 439.234375, + "completions/mean_terminated_length": 439.234375, + "completions/min_length": 219.25, + "completions/min_terminated_length": 219.25, + "epoch": 0.0355, + "grad_norm": 0.4408301115036011, + "kl": 0.03033447265625, + "learning_rate": 7e-06, + "loss": 0.0205, + "num_tokens": 2846787.0, + "reward": 2.015625, + "reward_std": 0.0625, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 71, + "token_counts/after_target": 717.0, + "token_counts/after_think": 605.75, + "token_counts/before_target": 1151.75, + "token_counts/before_think": 4553.25 + }, + { + "avg_penalty/after_target": 1.5361306071281433, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.49724867194890976, + "avg_penalty/before_think": 0.6683843433856964, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 746.75, + "completions/max_terminated_length": 746.75, + "completions/mean_length": 526.703125, + "completions/mean_terminated_length": 526.703125, + "completions/min_length": 323.0, + 
"completions/min_terminated_length": 323.0, + "epoch": 0.036, + "grad_norm": 0.05798822268843651, + "kl": 0.035888671875, + "learning_rate": 7.100000000000001e-06, + "loss": 0.0014, + "num_tokens": 2888992.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 72, + "token_counts/after_target": 482.5, + "token_counts/after_think": 866.5, + "token_counts/before_target": 996.0, + "token_counts/before_think": 6082.25 + }, + { + "avg_penalty/after_target": 2.0143339335918427, + "avg_penalty/after_think": 3.921064555644989, + "avg_penalty/before_target": 0.562159575521946, + "avg_penalty/before_think": 0.6938900202512741, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 869.5, + "completions/max_terminated_length": 813.5, + "completions/mean_length": 585.4375, + "completions/mean_terminated_length": 560.8365478515625, + "completions/min_length": 317.5, + "completions/min_terminated_length": 317.5, + "epoch": 0.0365, + "grad_norm": 0.48966851830482483, + "kl": 0.028411865234375, + "learning_rate": 7.2000000000000005e-06, + "loss": 0.0775, + "num_tokens": 2938332.0, + "reward": 1.88671875, + "reward_std": 0.2752849608659744, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.18616948276758194, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.09798731282353401, + "step": 73, + "token_counts/after_target": 1462.25, + "token_counts/after_think": 709.5, + "token_counts/before_target": 1178.25, + "token_counts/before_think": 6017.0 + }, + { + 
"avg_penalty/after_target": 2.2775047421455383, + "avg_penalty/after_think": 3.837079882621765, + "avg_penalty/before_target": 0.41608142107725143, + "avg_penalty/before_think": 0.6281181275844574, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 790.5, + "completions/max_terminated_length": 790.5, + "completions/mean_length": 576.640625, + "completions/mean_terminated_length": 576.640625, + "completions/min_length": 361.75, + "completions/min_terminated_length": 361.75, + "epoch": 0.037, + "grad_norm": 0.4428955316543579, + "kl": 0.02783203125, + "learning_rate": 7.3e-06, + "loss": -0.0342, + "num_tokens": 2986149.0, + "reward": 2.09375, + "reward_std": 0.125, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.125, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 74, + "token_counts/after_target": 724.75, + "token_counts/after_think": 906.25, + "token_counts/before_target": 686.5, + "token_counts/before_think": 6908.75 + }, + { + "avg_penalty/after_target": 1.3169972896575928, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.47197000682353973, + "avg_penalty/before_think": 0.5950516909360886, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.5, + "completions/max_terminated_length": 787.5, + "completions/mean_length": 486.125, + "completions/mean_terminated_length": 486.125, + "completions/min_length": 244.25, + "completions/min_terminated_length": 244.25, + "epoch": 0.0375, + "grad_norm": 1.063776969909668, + "kl": 0.028564453125, + "learning_rate": 7.4e-06, + "loss": 0.0133, 
+ "num_tokens": 3028589.0, + "reward": 2.109375, + "reward_std": 0.3509817495942116, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3221946656703949, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 75, + "token_counts/after_target": 528.5, + "token_counts/after_think": 748.0, + "token_counts/before_target": 948.0, + "token_counts/before_think": 5553.5 + }, + { + "avg_penalty/after_target": 1.8288001120090485, + "avg_penalty/after_think": 3.769866943359375, + "avg_penalty/before_target": 0.4244486838579178, + "avg_penalty/before_think": 0.5910222679376602, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 691.25, + "completions/max_terminated_length": 691.25, + "completions/mean_length": 430.6875, + "completions/mean_terminated_length": 430.6875, + "completions/min_length": 234.5, + "completions/min_terminated_length": 234.5, + "epoch": 0.038, + "grad_norm": 0.6818508505821228, + "kl": 0.03167724609375, + "learning_rate": 7.500000000000001e-06, + "loss": 0.0178, + "num_tokens": 3064313.0, + "reward": 2.078125, + "reward_std": 0.11967839300632477, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.11967839300632477, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 76, + "token_counts/after_target": 571.75, + "token_counts/after_think": 680.5, + "token_counts/before_target": 1354.25, + "token_counts/before_think": 4284.5 + }, + { + "avg_penalty/after_target": 2.7360042333602905, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.8189371973276138, + "avg_penalty/before_think": 0.6286966949701309, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 948.75, + "completions/max_terminated_length": 870.25, + "completions/mean_length": 589.078125, + "completions/mean_terminated_length": 563.9534454345703, + "completions/min_length": 245.5, + "completions/min_terminated_length": 245.5, + "epoch": 0.0385, + "grad_norm": 0.8423334956169128, + "kl": 0.027130126953125, + "learning_rate": 7.600000000000001e-06, + "loss": 0.0651, + "num_tokens": 3111214.0, + "reward": 1.9453125, + "reward_std": 0.44963234663009644, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.21039126068353653, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.1421622931957245, + "step": 77, + "token_counts/after_target": 1156.0, + "token_counts/after_think": 889.75, + "token_counts/before_target": 807.5, + "token_counts/before_think": 6572.0 + }, + { + "avg_penalty/after_target": 2.242621898651123, + "avg_penalty/after_think": 3.2627411484718323, + "avg_penalty/before_target": 0.5668349787592888, + "avg_penalty/before_think": 0.6466386467218399, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 878.25, + "completions/max_terminated_length": 823.25, + "completions/mean_length": 550.609375, + "completions/mean_terminated_length": 538.2552337646484, + "completions/min_length": 261.25, + "completions/min_terminated_length": 261.25, + "epoch": 0.039, + "grad_norm": 0.6599670648574829, + "kl": 0.027252197265625, + "learning_rate": 7.7e-06, + "loss": 0.0544, + "num_tokens": 3156405.0, + "reward": 2.01171875, + "reward_std": 
0.3310198038816452, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.21039126068353653, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.078125, + "step": 78, + "token_counts/after_target": 1087.25, + "token_counts/after_think": 645.0, + "token_counts/before_target": 1109.0, + "token_counts/before_think": 5968.5 + }, + { + "avg_penalty/after_target": 1.7124580144882202, + "avg_penalty/after_think": 3.9553207755088806, + "avg_penalty/before_target": 0.3162553012371063, + "avg_penalty/before_think": 0.652285248041153, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.5, + "completions/max_terminated_length": 762.5, + "completions/mean_length": 529.1875, + "completions/mean_terminated_length": 529.1875, + "completions/min_length": 319.5, + "completions/min_terminated_length": 319.5, + "epoch": 0.0395, + "grad_norm": 0.364238977432251, + "kl": 0.028167724609375, + "learning_rate": 7.800000000000002e-06, + "loss": 0.0449, + "num_tokens": 3199649.0, + "reward": 2.09375, + "reward_std": 0.18217839300632477, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.18217839300632477, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 79, + "token_counts/after_target": 697.0, + "token_counts/after_think": 801.25, + "token_counts/before_target": 901.5, + "token_counts/before_think": 6067.25 + }, + { + "avg_penalty/after_target": 1.8945733904838562, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.6452879682183266, + "avg_penalty/before_think": 0.5797562003135681, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, 
+ "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.25, + "completions/max_terminated_length": 744.25, + "completions/mean_length": 514.1875, + "completions/mean_terminated_length": 514.1875, + "completions/min_length": 316.5, + "completions/min_terminated_length": 316.5, + "epoch": 0.04, + "grad_norm": 0.4862828254699707, + "kl": 0.02532958984375, + "learning_rate": 7.9e-06, + "loss": 0.0078, + "num_tokens": 3241901.0, + "reward": 2.109375, + "reward_std": 0.1875, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.1875, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 80, + "token_counts/after_target": 662.5, + "token_counts/after_think": 695.75, + "token_counts/before_target": 559.75, + "token_counts/before_think": 6309.0 + }, + { + "avg_penalty/after_target": 2.571896553039551, + "avg_penalty/after_think": 2.876739263534546, + "avg_penalty/before_target": 0.7761808782815933, + "avg_penalty/before_think": 0.7153425961732864, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1018.0, + "completions/max_terminated_length": 981.0, + "completions/mean_length": 660.0, + "completions/mean_terminated_length": 633.5861511230469, + "completions/min_length": 284.75, + "completions/min_terminated_length": 284.75, + "epoch": 0.0405, + "grad_norm": 0.6715173721313477, + "kl": 0.029510498046875, + "learning_rate": 8.000000000000001e-06, + "loss": 0.1348, + "num_tokens": 3294253.0, + "reward": 1.8671875, + "reward_std": 0.37033721804618835, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.90625, + 
"rewards/format_reward/std": 0.23680340498685837, + "rewards/tag_count_reward/mean": 0.9453125, + "rewards/tag_count_reward/std": 0.11510255187749863, + "step": 81, + "token_counts/after_target": 1844.5, + "token_counts/after_think": 834.25, + "token_counts/before_target": 1064.25, + "token_counts/before_think": 6817.0 + }, + { + "avg_penalty/after_target": 2.417271316051483, + "avg_penalty/after_think": 3.8729883432388306, + "avg_penalty/before_target": 0.4548543393611908, + "avg_penalty/before_think": 0.570919968187809, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 688.0, + "completions/max_terminated_length": 647.25, + "completions/mean_length": 448.65625, + "completions/mean_terminated_length": 439.8482208251953, + "completions/min_length": 217.5, + "completions/min_terminated_length": 217.5, + "epoch": 0.041, + "grad_norm": 0.3348902463912964, + "kl": 0.04168701171875, + "learning_rate": 8.1e-06, + "loss": 0.0215, + "num_tokens": 3336743.0, + "reward": 1.98046875, + "reward_std": 0.078125, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 82, + "token_counts/after_target": 828.25, + "token_counts/after_think": 337.5, + "token_counts/before_target": 533.25, + "token_counts/before_think": 5479.5 + }, + { + "avg_penalty/after_target": 1.6453037559986115, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.6676421537995338, + "avg_penalty/before_think": 0.5843763649463654, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + 
"completions/max_length": 777.5, + "completions/max_terminated_length": 718.25, + "completions/mean_length": 522.03125, + "completions/mean_terminated_length": 515.6187591552734, + "completions/min_length": 195.5, + "completions/min_terminated_length": 195.5, + "epoch": 0.0415, + "grad_norm": 0.6681420803070068, + "kl": 0.035888671875, + "learning_rate": 8.2e-06, + "loss": 0.0261, + "num_tokens": 3379785.0, + "reward": 2.16796875, + "reward_std": 0.26030339300632477, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.18217839300632477, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 83, + "token_counts/after_target": 397.5, + "token_counts/after_think": 742.5, + "token_counts/before_target": 438.5, + "token_counts/before_think": 6774.0 + }, + { + "avg_penalty/after_target": 2.5297506153583527, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.6744520515203476, + "avg_penalty/before_think": 0.7284214645624161, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 867.5, + "completions/max_terminated_length": 855.0, + "completions/mean_length": 539.640625, + "completions/mean_terminated_length": 526.5446472167969, + "completions/min_length": 232.75, + "completions/min_terminated_length": 232.75, + "epoch": 0.042, + "grad_norm": 0.3679393529891968, + "kl": 0.033447265625, + "learning_rate": 8.3e-06, + "loss": 0.0524, + "num_tokens": 3426258.0, + "reward": 1.9453125, + "reward_std": 0.14943470060825348, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.08539126068353653, + "rewards/tag_count_reward/mean": 0.9765625, + 
"rewards/tag_count_reward/std": 0.06404344737529755, + "step": 84, + "token_counts/after_target": 1710.75, + "token_counts/after_think": 717.0, + "token_counts/before_target": 1013.75, + "token_counts/before_think": 5192.75 + }, + { + "avg_penalty/after_target": 1.5966375768184662, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.32903778553009033, + "avg_penalty/before_think": 0.6421308666467667, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.25, + "completions/max_terminated_length": 685.25, + "completions/mean_length": 494.984375, + "completions/mean_terminated_length": 494.984375, + "completions/min_length": 243.75, + "completions/min_terminated_length": 243.75, + "epoch": 0.0425, + "grad_norm": 0.4612640142440796, + "kl": 0.0419921875, + "learning_rate": 8.400000000000001e-06, + "loss": -0.0155, + "num_tokens": 3466369.0, + "reward": 2.109375, + "reward_std": 0.2050696462392807, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.2050696536898613, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 85, + "token_counts/after_target": 526.5, + "token_counts/after_think": 821.25, + "token_counts/before_target": 1288.0, + "token_counts/before_think": 5284.0 + }, + { + "avg_penalty/after_target": 2.408757507801056, + "avg_penalty/after_think": 3.896373987197876, + "avg_penalty/before_target": 0.3696664609014988, + "avg_penalty/before_think": 0.6566064581274986, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.5, + "completions/max_terminated_length": 719.5, + 
"completions/mean_length": 446.328125, + "completions/mean_terminated_length": 446.328125, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.043, + "grad_norm": 0.45140090584754944, + "kl": 0.041748046875, + "learning_rate": 8.5e-06, + "loss": -0.0459, + "num_tokens": 3502422.0, + "reward": 2.1484375, + "reward_std": 0.20295128226280212, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.1905868947505951, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.03125, + "step": 86, + "token_counts/after_target": 1040.0, + "token_counts/after_think": 349.5, + "token_counts/before_target": 1554.75, + "token_counts/before_think": 4197.0 + }, + { + "avg_penalty/after_target": 1.9599685966968536, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.38279859721660614, + "avg_penalty/before_think": 0.6806656867265701, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 895.0, + "completions/max_terminated_length": 792.75, + "completions/mean_length": 490.71875, + "completions/mean_terminated_length": 480.61146545410156, + "completions/min_length": 182.25, + "completions/min_terminated_length": 182.25, + "epoch": 0.0435, + "grad_norm": 0.6182295083999634, + "kl": 0.035003662109375, + "learning_rate": 8.6e-06, + "loss": 0.1072, + "num_tokens": 3542116.0, + "reward": 1.96875, + "reward_std": 0.1877278834581375, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.14789126068353653, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.0625, + "step": 87, + "token_counts/after_target": 
1107.0, + "token_counts/after_think": 463.5, + "token_counts/before_target": 1416.0, + "token_counts/before_think": 4865.0 + }, + { + "avg_penalty/after_target": 2.0441923439502716, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.468051016330719, + "avg_penalty/before_think": 0.6602441072463989, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 782.5, + "completions/max_terminated_length": 782.5, + "completions/mean_length": 521.078125, + "completions/mean_terminated_length": 521.078125, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.044, + "grad_norm": 0.3620375096797943, + "kl": 0.0357666015625, + "learning_rate": 8.700000000000001e-06, + "loss": -0.003, + "num_tokens": 3586057.0, + "reward": 2.015625, + "reward_std": 0.0625, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 88, + "token_counts/after_target": 1305.5, + "token_counts/after_think": 406.25, + "token_counts/before_target": 1825.0, + "token_counts/before_think": 4800.5 + }, + { + "avg_penalty/after_target": 1.987017959356308, + "avg_penalty/after_think": 3.95677649974823, + "avg_penalty/before_target": 0.39956825226545334, + "avg_penalty/before_think": 0.6414850354194641, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 831.0, + "completions/max_terminated_length": 831.0, + "completions/mean_length": 519.109375, + "completions/mean_terminated_length": 519.109375, + "completions/min_length": 263.5, + 
"completions/min_terminated_length": 263.5, + "epoch": 0.0445, + "grad_norm": 0.06656046211719513, + "kl": 0.0462646484375, + "learning_rate": 8.8e-06, + "loss": 0.0019, + "num_tokens": 3628736.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 89, + "token_counts/after_target": 1056.0, + "token_counts/after_think": 755.75, + "token_counts/before_target": 1481.75, + "token_counts/before_think": 5012.25 + }, + { + "avg_penalty/after_target": 1.909359186887741, + "avg_penalty/after_think": 3.987008571624756, + "avg_penalty/before_target": 0.47797761112451553, + "avg_penalty/before_think": 0.6317217126488686, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 708.75, + "completions/max_terminated_length": 631.75, + "completions/mean_length": 368.578125, + "completions/mean_terminated_length": 358.59896087646484, + "completions/min_length": 104.75, + "completions/min_terminated_length": 104.75, + "epoch": 0.045, + "grad_norm": 0.6666271686553955, + "kl": 0.04534912109375, + "learning_rate": 8.900000000000001e-06, + "loss": 0.11, + "num_tokens": 3660981.0, + "reward": 2.01171875, + "reward_std": 0.16351625323295593, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 90, + "token_counts/after_target": 918.75, + "token_counts/after_think": 194.0, + "token_counts/before_target": 2281.5, + "token_counts/before_think": 2503.0 + }, + { + 
"avg_penalty/after_target": 1.8183318078517914, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.31194495782256126, + "avg_penalty/before_think": 0.6287956684827805, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.5, + "completions/max_terminated_length": 622.5, + "completions/mean_length": 393.125, + "completions/mean_terminated_length": 393.125, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.0455, + "grad_norm": 0.07417535036802292, + "kl": 0.04986572265625, + "learning_rate": 9e-06, + "loss": 0.002, + "num_tokens": 3698269.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 91, + "token_counts/after_target": 652.25, + "token_counts/after_think": 506.0, + "token_counts/before_target": 1403.25, + "token_counts/before_think": 3728.5 + }, + { + "avg_penalty/after_target": 2.48397696018219, + "avg_penalty/after_think": 3.584916114807129, + "avg_penalty/before_target": 0.43334195762872696, + "avg_penalty/before_think": 0.5618027150630951, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 693.5, + "completions/max_terminated_length": 693.5, + "completions/mean_length": 449.4375, + "completions/mean_terminated_length": 449.4375, + "completions/min_length": 166.5, + "completions/min_terminated_length": 166.5, + "epoch": 0.046, + "grad_norm": 0.585016131401062, + "kl": 0.05023193359375, + "learning_rate": 9.100000000000001e-06, + "loss": -0.0106, + 
"num_tokens": 3738521.0, + "reward": 2.046875, + "reward_std": 0.14789125323295593, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.14789126068353653, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 92, + "token_counts/after_target": 1096.5, + "token_counts/after_think": 310.0, + "token_counts/before_target": 1832.75, + "token_counts/before_think": 3951.75 + }, + { + "avg_penalty/after_target": 2.200825333595276, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.48818066716194153, + "avg_penalty/before_think": 0.6161898523569107, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 719.25, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 433.515625, + "completions/mean_terminated_length": 426.3208465576172, + "completions/min_length": 257.25, + "completions/min_terminated_length": 257.25, + "epoch": 0.0465, + "grad_norm": 0.7053835391998291, + "kl": 0.04156494140625, + "learning_rate": 9.200000000000002e-06, + "loss": 0.0398, + "num_tokens": 3774522.0, + "reward": 2.1015625, + "reward_std": 0.33822086453437805, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.23148179799318314, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.08539126068353653, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.021347815170884132, + "step": 93, + "token_counts/after_target": 883.0, + "token_counts/after_think": 354.25, + "token_counts/before_target": 1631.25, + "token_counts/before_think": 4067.75 + }, + { + "avg_penalty/after_target": 1.8788138031959534, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4462868422269821, + 
"avg_penalty/before_think": 0.6758236885070801, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 868.0, + "completions/max_terminated_length": 831.75, + "completions/mean_length": 615.875, + "completions/mean_terminated_length": 609.7270965576172, + "completions/min_length": 390.5, + "completions/min_terminated_length": 390.5, + "epoch": 0.047, + "grad_norm": 0.5234842300415039, + "kl": 0.035980224609375, + "learning_rate": 9.3e-06, + "loss": 0.0186, + "num_tokens": 3828386.0, + "reward": 1.99609375, + "reward_std": 0.140625, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 94, + "token_counts/after_target": 1066.0, + "token_counts/after_think": 850.0, + "token_counts/before_target": 1336.75, + "token_counts/before_think": 6601.25 + }, + { + "avg_penalty/after_target": 1.454824447631836, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.39262912422418594, + "avg_penalty/before_think": 0.6988170593976974, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 891.0, + "completions/max_terminated_length": 844.5, + "completions/mean_length": 557.015625, + "completions/mean_terminated_length": 549.4791717529297, + "completions/min_length": 309.75, + "completions/min_terminated_length": 309.75, + "epoch": 0.0475, + "grad_norm": 0.5646129250526428, + "kl": 0.04632568359375, + "learning_rate": 9.4e-06, + "loss": 0.0219, + "num_tokens": 3874691.0, + "reward": 2.0, + "reward_std": 0.27799537777900696, + 
"rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.14789126068353653, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.08539126068353653, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.04841229319572449, + "step": 95, + "token_counts/after_target": 763.75, + "token_counts/after_think": 968.0, + "token_counts/before_target": 908.25, + "token_counts/before_think": 6272.25 + }, + { + "avg_penalty/after_target": 1.7797716557979584, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.519579604268074, + "avg_penalty/before_think": 0.6621666476130486, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 805.5, + "completions/max_terminated_length": 805.5, + "completions/mean_length": 505.125, + "completions/mean_terminated_length": 505.125, + "completions/min_length": 312.5, + "completions/min_terminated_length": 312.5, + "epoch": 0.048, + "grad_norm": 0.6518815159797668, + "kl": 0.041412353515625, + "learning_rate": 9.5e-06, + "loss": 0.0169, + "num_tokens": 3918651.0, + "reward": 2.125, + "reward_std": 0.21039125323295593, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.21039126068353653, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 96, + "token_counts/after_target": 908.75, + "token_counts/after_think": 742.5, + "token_counts/before_target": 886.75, + "token_counts/before_think": 5544.0 + }, + { + "avg_penalty/after_target": 1.3930826485157013, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.365626685321331, + "avg_penalty/before_think": 0.6702770888805389, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.5, + "completions/max_terminated_length": 842.5, + "completions/mean_length": 580.640625, + "completions/mean_terminated_length": 580.640625, + "completions/min_length": 374.25, + "completions/min_terminated_length": 374.25, + "epoch": 0.0485, + "grad_norm": 0.36519095301628113, + "kl": 0.05767822265625, + "learning_rate": 9.600000000000001e-06, + "loss": -0.012, + "num_tokens": 3964372.0, + "reward": 2.02734375, + "reward_std": 0.13495513796806335, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 97, + "token_counts/after_target": 524.75, + "token_counts/after_think": 791.0, + "token_counts/before_target": 508.5, + "token_counts/before_think": 7466.0 + }, + { + "avg_penalty/after_target": 1.7655830085277557, + "avg_penalty/after_think": 3.7877871990203857, + "avg_penalty/before_target": 0.45123688876628876, + "avg_penalty/before_think": 0.7044995874166489, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 949.25, + "completions/max_terminated_length": 888.75, + "completions/mean_length": 634.75, + "completions/mean_terminated_length": 617.1301422119141, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.049, + "grad_norm": 0.5198728442192078, + "kl": 0.0496826171875, + "learning_rate": 9.7e-06, + "loss": 0.0684, + "num_tokens": 4017252.0, + "reward": 1.9375, + "reward_std": 0.29429279267787933, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + 
"rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.17430340498685837, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.08004852384328842, + "step": 98, + "token_counts/after_target": 987.0, + "token_counts/after_think": 1025.0, + "token_counts/before_target": 521.75, + "token_counts/before_think": 7622.25 + }, + { + "avg_penalty/after_target": 1.612476110458374, + "avg_penalty/after_think": 3.901442050933838, + "avg_penalty/before_target": 0.3197822868824005, + "avg_penalty/before_think": 0.6765367835760117, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 882.0, + "completions/max_terminated_length": 852.0, + "completions/mean_length": 587.46875, + "completions/mean_terminated_length": 574.9642944335938, + "completions/min_length": 406.75, + "completions/min_terminated_length": 406.75, + "epoch": 0.0495, + "grad_norm": 0.45015138387680054, + "kl": 0.04510498046875, + "learning_rate": 9.800000000000001e-06, + "loss": 0.0635, + "num_tokens": 4066210.0, + "reward": 1.9921875, + "reward_std": 0.19213032722473145, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.08539126068353653, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.021347815170884132, + "step": 99, + "token_counts/after_target": 730.0, + "token_counts/after_think": 912.5, + "token_counts/before_target": 546.25, + "token_counts/before_think": 7210.75 + }, + { + "avg_penalty/after_target": 2.017862915992737, + "avg_penalty/after_think": 3.885262906551361, + "avg_penalty/before_target": 0.3555294945836067, + "avg_penalty/before_think": 0.6782254278659821, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 884.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 662.0, + "completions/mean_terminated_length": 642.9531402587891, + "completions/min_length": 450.25, + "completions/min_terminated_length": 450.25, + "epoch": 0.05, + "grad_norm": 0.6141223907470703, + "kl": 0.04931640625, + "learning_rate": 9.9e-06, + "loss": 0.109, + "num_tokens": 4119186.0, + "reward": 2.0078125, + "reward_std": 0.2632553353905678, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.1971946656703949, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.17078252136707306, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.06976010836660862, + "step": 100, + "token_counts/after_target": 1167.0, + "token_counts/after_think": 1116.25, + "token_counts/before_target": 478.75, + "token_counts/before_think": 7830.0 + }, + { + "avg_penalty/after_target": 1.5823715329170227, + "avg_penalty/after_think": 3.939994692802429, + "avg_penalty/before_target": 0.47399451583623886, + "avg_penalty/before_think": 0.7031668424606323, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 889.75, + "completions/max_terminated_length": 831.25, + "completions/mean_length": 593.65625, + "completions/mean_terminated_length": 581.3739776611328, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.0505, + "grad_norm": 0.5045070052146912, + "kl": 0.0357666015625, + "learning_rate": 1e-05, + "loss": 0.0615, + "num_tokens": 4167612.0, + "reward": 1.94140625, + "reward_std": 0.1848640739917755, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.14789126068353653, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.03697281517088413, + "step": 101, + "token_counts/after_target": 1035.75, + "token_counts/after_think": 883.75, + "token_counts/before_target": 697.25, + "token_counts/before_think": 6881.75 + }, + { + "avg_penalty/after_target": 1.659296303987503, + "avg_penalty/after_think": 3.9829108119010925, + "avg_penalty/before_target": 0.43991997092962265, + "avg_penalty/before_think": 0.6884558126330376, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 831.25, + "completions/max_terminated_length": 831.25, + "completions/mean_length": 562.78125, + "completions/mean_terminated_length": 562.78125, + "completions/min_length": 357.75, + "completions/min_terminated_length": 357.75, + "epoch": 0.051, + "grad_norm": 0.625710129737854, + "kl": 0.05499267578125, + "learning_rate": 1.0100000000000002e-05, + "loss": 0.0036, + "num_tokens": 4212414.0, + "reward": 2.0859375, + "reward_std": 0.28534944355487823, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.12909944355487823, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.03125, + "step": 102, + "token_counts/after_target": 1152.0, + "token_counts/after_think": 955.0, + "token_counts/before_target": 787.0, + "token_counts/before_think": 6110.5 + }, + { + "avg_penalty/after_target": 1.5438488721847534, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3882431983947754, + "avg_penalty/before_think": 0.6491522192955017, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 
0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 503.5, + "completions/mean_terminated_length": 503.5, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.0515, + "grad_norm": 0.3895302414894104, + "kl": 0.05181884765625, + "learning_rate": 1.02e-05, + "loss": 0.0002, + "num_tokens": 4253358.0, + "reward": 1.98046875, + "reward_std": 0.078125, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 103, + "token_counts/after_target": 574.75, + "token_counts/after_think": 739.5, + "token_counts/before_target": 660.75, + "token_counts/before_think": 6081.0 + }, + { + "avg_penalty/after_target": 1.6060185730457306, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.34231629967689514, + "avg_penalty/before_think": 0.7112623453140259, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 802.0, + "completions/max_terminated_length": 802.0, + "completions/mean_length": 563.046875, + "completions/mean_terminated_length": 563.046875, + "completions/min_length": 333.5, + "completions/min_terminated_length": 333.5, + "epoch": 0.052, + "grad_norm": 0.058229755610227585, + "kl": 0.04827880859375, + "learning_rate": 1.0300000000000001e-05, + "loss": 0.0019, + "num_tokens": 4299329.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + 
"rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 104, + "token_counts/after_target": 1052.25, + "token_counts/after_think": 796.5, + "token_counts/before_target": 845.5, + "token_counts/before_think": 6314.5 + }, + { + "avg_penalty/after_target": 1.9091512560844421, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.592461459338665, + "avg_penalty/before_think": 0.6648519411683083, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 873.5, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 587.328125, + "completions/mean_terminated_length": 582.0823059082031, + "completions/min_length": 318.25, + "completions/min_terminated_length": 318.25, + "epoch": 0.0525, + "grad_norm": 0.7582993507385254, + "kl": 0.05096435546875, + "learning_rate": 1.04e-05, + "loss": 0.0404, + "num_tokens": 4350982.0, + "reward": 2.0625, + "reward_std": 0.3155868798494339, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.1280868947505951, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.0625, + "step": 105, + "token_counts/after_target": 900.5, + "token_counts/after_think": 920.5, + "token_counts/before_target": 486.25, + "token_counts/before_think": 7090.0 + }, + { + "avg_penalty/after_target": 2.244471460580826, + "avg_penalty/after_think": 3.9232614636421204, + "avg_penalty/before_target": 0.3482939228415489, + "avg_penalty/before_think": 0.5167969912290573, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.5, + 
"completions/max_terminated_length": 614.5, + "completions/mean_length": 380.296875, + "completions/mean_terminated_length": 380.296875, + "completions/min_length": 241.5, + "completions/min_terminated_length": 241.5, + "epoch": 0.053, + "grad_norm": 0.5787433385848999, + "kl": 0.062255859375, + "learning_rate": 1.0500000000000001e-05, + "loss": 0.0321, + "num_tokens": 4387481.0, + "reward": 2.125, + "reward_std": 0.22045661509037018, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.22045661509037018, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 106, + "token_counts/after_target": 448.25, + "token_counts/after_think": 498.0, + "token_counts/before_target": 538.0, + "token_counts/before_think": 4600.5 + }, + { + "avg_penalty/after_target": 1.6430258750915527, + "avg_penalty/after_think": 3.9458208680152893, + "avg_penalty/before_target": 0.34218254685401917, + "avg_penalty/before_think": 0.632422685623169, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 775.75, + "completions/max_terminated_length": 775.75, + "completions/mean_length": 479.421875, + "completions/mean_terminated_length": 479.421875, + "completions/min_length": 280.75, + "completions/min_terminated_length": 280.75, + "epoch": 0.0535, + "grad_norm": 0.46171554923057556, + "kl": 0.0513916015625, + "learning_rate": 1.0600000000000002e-05, + "loss": 0.0022, + "num_tokens": 4428548.0, + "reward": 1.98046875, + "reward_std": 0.078125, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 107, + 
"token_counts/after_target": 614.25, + "token_counts/after_think": 696.75, + "token_counts/before_target": 542.25, + "token_counts/before_think": 5817.5 + }, + { + "avg_penalty/after_target": 1.3472897708415985, + "avg_penalty/after_think": 3.928815007209778, + "avg_penalty/before_target": 0.42203543335199356, + "avg_penalty/before_think": 0.6156432926654816, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 754.75, + "completions/max_terminated_length": 754.75, + "completions/mean_length": 464.5, + "completions/mean_terminated_length": 464.5, + "completions/min_length": 226.5, + "completions/min_terminated_length": 226.5, + "epoch": 0.054, + "grad_norm": 0.9527857303619385, + "kl": 0.05828857421875, + "learning_rate": 1.0700000000000001e-05, + "loss": -0.0412, + "num_tokens": 4470516.0, + "reward": 2.09375, + "reward_std": 0.2561737596988678, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.2561737820506096, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 108, + "token_counts/after_target": 464.5, + "token_counts/after_think": 603.75, + "token_counts/before_target": 602.25, + "token_counts/before_think": 5761.5 + }, + { + "avg_penalty/after_target": 1.8588409423828125, + "avg_penalty/after_think": 3.924590528011322, + "avg_penalty/before_target": 0.3402659222483635, + "avg_penalty/before_think": 0.5641708225011826, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.75, + "completions/max_terminated_length": 647.75, + "completions/mean_length": 419.09375, + "completions/mean_terminated_length": 
419.09375, + "completions/min_length": 154.5, + "completions/min_terminated_length": 154.5, + "epoch": 0.0545, + "grad_norm": 0.3512745797634125, + "kl": 0.06072998046875, + "learning_rate": 1.0800000000000002e-05, + "loss": 0.031, + "num_tokens": 4505482.0, + "reward": 1.98046875, + "reward_std": 0.078125, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 109, + "token_counts/after_target": 440.25, + "token_counts/after_think": 599.5, + "token_counts/before_target": 491.25, + "token_counts/before_think": 5174.5 + }, + { + "avg_penalty/after_target": 1.681037425994873, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3425459638237953, + "avg_penalty/before_think": 0.6365179419517517, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.25, + "completions/max_terminated_length": 707.25, + "completions/mean_length": 511.390625, + "completions/mean_terminated_length": 511.390625, + "completions/min_length": 293.75, + "completions/min_terminated_length": 293.75, + "epoch": 0.055, + "grad_norm": 0.07390904426574707, + "kl": 0.057861328125, + "learning_rate": 1.0900000000000002e-05, + "loss": 0.0023, + "num_tokens": 4548627.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 110, + "token_counts/after_target": 892.75, + "token_counts/after_think": 687.5, + "token_counts/before_target": 713.25, + "token_counts/before_think": 5888.75 + }, + { + 
"avg_penalty/after_target": 1.7165743708610535, + "avg_penalty/after_think": 3.8749056458473206, + "avg_penalty/before_target": 0.33362555503845215, + "avg_penalty/before_think": 0.5397411808371544, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 615.5, + "completions/max_terminated_length": 615.5, + "completions/mean_length": 381.25, + "completions/mean_terminated_length": 381.25, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.0555, + "grad_norm": 0.5548754334449768, + "kl": 0.06982421875, + "learning_rate": 1.1000000000000001e-05, + "loss": -0.0009, + "num_tokens": 4583427.0, + "reward": 2.125, + "reward_std": 0.1905868798494339, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.1905868947505951, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 111, + "token_counts/after_target": 468.0, + "token_counts/after_think": 547.75, + "token_counts/before_target": 455.75, + "token_counts/before_think": 4628.5 + }, + { + "avg_penalty/after_target": 1.3760262429714203, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3084510862827301, + "avg_penalty/before_think": 0.569564938545227, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.75, + "completions/max_terminated_length": 733.75, + "completions/mean_length": 475.3125, + "completions/mean_terminated_length": 475.3125, + "completions/min_length": 307.75, + "completions/min_terminated_length": 307.75, + "epoch": 0.056, + "grad_norm": 0.31378379464149475, + "kl": 0.06463623046875, + 
"learning_rate": 1.1100000000000002e-05, + "loss": -0.0021, + "num_tokens": 4622055.0, + "reward": 1.98046875, + "reward_std": 0.078125, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 112, + "token_counts/after_target": 360.0, + "token_counts/after_think": 725.5, + "token_counts/before_target": 375.5, + "token_counts/before_think": 6144.0 + }, + { + "avg_penalty/after_target": 1.5692787766456604, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5087653398513794, + "avg_penalty/before_think": 0.6551297903060913, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 802.75, + "completions/max_terminated_length": 774.75, + "completions/mean_length": 526.140625, + "completions/mean_terminated_length": 518.9573059082031, + "completions/min_length": 315.75, + "completions/min_terminated_length": 315.75, + "epoch": 0.0565, + "grad_norm": 0.6340245604515076, + "kl": 0.05731201171875, + "learning_rate": 1.1200000000000001e-05, + "loss": 0.0371, + "num_tokens": 4665280.0, + "reward": 1.96484375, + "reward_std": 0.21364577859640121, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.14789126068353653, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.06822281517088413, + "step": 113, + "token_counts/after_target": 914.75, + "token_counts/after_think": 842.25, + "token_counts/before_target": 799.5, + "token_counts/before_think": 5861.75 + }, + { + "avg_penalty/after_target": 1.4072176218032837, + "avg_penalty/after_think": 4.0, + 
"avg_penalty/before_target": 0.6774024069309235, + "avg_penalty/before_think": 0.7003274112939835, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 930.25, + "completions/max_terminated_length": 916.5, + "completions/mean_length": 610.796875, + "completions/mean_terminated_length": 594.0192108154297, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.057, + "grad_norm": 0.6635134816169739, + "kl": 0.0589599609375, + "learning_rate": 1.13e-05, + "loss": 0.0683, + "num_tokens": 4714323.0, + "reward": 1.90625, + "reward_std": 0.317604124546051, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.21039126068353653, + "rewards/tag_count_reward/mean": 0.96875, + "rewards/tag_count_reward/std": 0.11091229319572449, + "step": 114, + "token_counts/after_target": 1147.5, + "token_counts/after_think": 1019.0, + "token_counts/before_target": 651.0, + "token_counts/before_think": 6955.25 + }, + { + "avg_penalty/after_target": 1.408301442861557, + "avg_penalty/after_think": 3.990747094154358, + "avg_penalty/before_target": 0.43282805383205414, + "avg_penalty/before_think": 0.6605774462223053, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 817.75, + "completions/max_terminated_length": 777.5, + "completions/mean_length": 547.328125, + "completions/mean_terminated_length": 542.2843780517578, + "completions/min_length": 318.75, + "completions/min_terminated_length": 318.75, + "epoch": 0.0575, + "grad_norm": 0.3900512158870697, + "kl": 0.05853271484375, + "learning_rate": 1.14e-05, + "loss": 0.0286, + 
"num_tokens": 4759640.0, + "reward": 1.98046875, + "reward_std": 0.078125, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 115, + "token_counts/after_target": 812.25, + "token_counts/after_think": 807.0, + "token_counts/before_target": 660.75, + "token_counts/before_think": 6477.25 + }, + { + "avg_penalty/after_target": 2.077223479747772, + "avg_penalty/after_think": 3.786979079246521, + "avg_penalty/before_target": 0.3406173214316368, + "avg_penalty/before_think": 0.6323474496603012, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 868.5, + "completions/max_terminated_length": 868.5, + "completions/mean_length": 578.796875, + "completions/mean_terminated_length": 578.796875, + "completions/min_length": 329.5, + "completions/min_terminated_length": 329.5, + "epoch": 0.058, + "grad_norm": 0.3901791572570801, + "kl": 0.06488037109375, + "learning_rate": 1.15e-05, + "loss": -0.0132, + "num_tokens": 4806059.0, + "reward": 2.046875, + "reward_std": 0.10077822208404541, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 116, + "token_counts/after_target": 442.5, + "token_counts/after_think": 1113.5, + "token_counts/before_target": 465.75, + "token_counts/before_think": 7239.0 + }, + { + "avg_penalty/after_target": 1.8262156546115875, + "avg_penalty/after_think": 3.9595565795898438, + "avg_penalty/before_target": 0.6979354470968246, + "avg_penalty/before_think": 0.7544626146554947, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 996.25, + "completions/max_terminated_length": 987.5, + "completions/mean_length": 716.234375, + "completions/mean_terminated_length": 703.4622192382812, + "completions/min_length": 354.75, + "completions/min_terminated_length": 354.75, + "epoch": 0.0585, + "grad_norm": 0.5565170645713806, + "kl": 0.0577392578125, + "learning_rate": 1.16e-05, + "loss": 0.0524, + "num_tokens": 4860874.0, + "reward": 1.93359375, + "reward_std": 0.2161140739917755, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.14789126068353653, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.06822281517088413, + "step": 117, + "token_counts/after_target": 1445.0, + "token_counts/after_think": 1144.75, + "token_counts/before_target": 672.0, + "token_counts/before_think": 8198.0 + }, + { + "avg_penalty/after_target": 1.5364782810211182, + "avg_penalty/after_think": 3.9313612580299377, + "avg_penalty/before_target": 0.34760668873786926, + "avg_penalty/before_think": 0.6740705817937851, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 883.5, + "completions/max_terminated_length": 831.75, + "completions/mean_length": 524.4375, + "completions/mean_terminated_length": 511.48126220703125, + "completions/min_length": 316.75, + "completions/min_terminated_length": 316.75, + "epoch": 0.059, + "grad_norm": 0.6509829759597778, + "kl": 0.0648193359375, + "learning_rate": 1.17e-05, + "loss": 0.0793, + "num_tokens": 4901606.0, + "reward": 1.9609375, + "reward_std": 0.15625, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.03125, + "step": 118, + "token_counts/after_target": 870.5, + "token_counts/after_think": 877.0, + "token_counts/before_target": 384.5, + "token_counts/before_think": 6259.0 + }, + { + "avg_penalty/after_target": 1.7030967772006989, + "avg_penalty/after_think": 3.9552891850471497, + "avg_penalty/before_target": 0.3619203492999077, + "avg_penalty/before_think": 0.5972295850515366, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.25, + "completions/max_terminated_length": 736.25, + "completions/mean_length": 450.109375, + "completions/mean_terminated_length": 450.109375, + "completions/min_length": 179.5, + "completions/min_terminated_length": 179.5, + "epoch": 0.0595, + "grad_norm": 0.5433244109153748, + "kl": 0.06549072265625, + "learning_rate": 1.18e-05, + "loss": -0.0007, + "num_tokens": 4939373.0, + "reward": 2.125, + "reward_std": 0.22360679507255554, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.22360680997371674, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 119, + "token_counts/after_target": 603.75, + "token_counts/after_think": 638.5, + "token_counts/before_target": 444.0, + "token_counts/before_think": 5515.5 + }, + { + "avg_penalty/after_target": 1.9741381406784058, + "avg_penalty/after_think": 3.908596396446228, + "avg_penalty/before_target": 0.4937697574496269, + "avg_penalty/before_think": 0.6846992373466492, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.75, + "completions/max_terminated_length": 846.75, + "completions/mean_length": 507.9375, + "completions/mean_terminated_length": 507.9375, + "completions/min_length": 259.25, + "completions/min_terminated_length": 259.25, + "epoch": 0.06, + "grad_norm": 0.06162354350090027, + "kl": 0.070068359375, + "learning_rate": 1.1900000000000001e-05, + "loss": 0.0028, + "num_tokens": 4980505.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 120, + "token_counts/after_target": 1078.75, + "token_counts/after_think": 644.75, + "token_counts/before_target": 471.5, + "token_counts/before_think": 5932.0 + }, + { + "avg_penalty/after_target": 1.5284562706947327, + "avg_penalty/after_think": 3.9642210602760315, + "avg_penalty/before_target": 0.4502527639269829, + "avg_penalty/before_think": 0.614184707403183, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.5, + "completions/max_terminated_length": 765.5, + "completions/mean_length": 484.3125, + "completions/mean_terminated_length": 484.3125, + "completions/min_length": 282.75, + "completions/min_terminated_length": 282.75, + "epoch": 0.0605, + "grad_norm": 0.5291619300842285, + "kl": 0.05523681640625, + "learning_rate": 1.2e-05, + "loss": 0.0035, + "num_tokens": 5019565.0, + "reward": 2.015625, + "reward_std": 0.0625, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, 
+ "rewards/tag_count_reward/std": 0.0, + "step": 121, + "token_counts/after_target": 563.5, + "token_counts/after_think": 778.75, + "token_counts/before_target": 411.75, + "token_counts/before_think": 5995.0 + }, + { + "avg_penalty/after_target": 1.6053639650344849, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3214307278394699, + "avg_penalty/before_think": 0.6485062539577484, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 752.75, + "completions/max_terminated_length": 752.75, + "completions/mean_length": 457.6875, + "completions/mean_terminated_length": 457.6875, + "completions/min_length": 205.5, + "completions/min_terminated_length": 205.5, + "epoch": 0.061, + "grad_norm": 0.37864381074905396, + "kl": 0.06707763671875, + "learning_rate": 1.2100000000000001e-05, + "loss": -0.0061, + "num_tokens": 5057785.0, + "reward": 2.03125, + "reward_std": 0.08539125323295593, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 122, + "token_counts/after_target": 660.75, + "token_counts/after_think": 710.75, + "token_counts/before_target": 639.0, + "token_counts/before_think": 5312.5 + }, + { + "avg_penalty/after_target": 1.499027669429779, + "avg_penalty/after_think": 3.861760914325714, + "avg_penalty/before_target": 0.4989098533987999, + "avg_penalty/before_think": 0.697375014424324, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 808.5, + "completions/max_terminated_length": 745.75, + "completions/mean_length": 
468.625, + "completions/mean_terminated_length": 454.54063415527344, + "completions/min_length": 170.75, + "completions/min_terminated_length": 170.75, + "epoch": 0.0615, + "grad_norm": 0.7003433108329773, + "kl": 0.06005859375, + "learning_rate": 1.22e-05, + "loss": 0.0908, + "num_tokens": 5098817.0, + "reward": 1.953125, + "reward_std": 0.1875, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.0625, + "step": 123, + "token_counts/after_target": 1171.5, + "token_counts/after_think": 678.0, + "token_counts/before_target": 1129.5, + "token_counts/before_think": 4519.0 + }, + { + "avg_penalty/after_target": 1.6986032724380493, + "avg_penalty/after_think": 3.8922412991523743, + "avg_penalty/before_target": 0.31508395075798035, + "avg_penalty/before_think": 0.5492830500006676, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.5, + "completions/max_terminated_length": 574.5, + "completions/mean_length": 380.953125, + "completions/mean_terminated_length": 380.953125, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.062, + "grad_norm": 0.35301098227500916, + "kl": 0.0751953125, + "learning_rate": 1.23e-05, + "loss": -0.0012, + "num_tokens": 5133022.0, + "reward": 2.0625, + "reward_std": 0.11180339753627777, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.11180340498685837, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 124, + "token_counts/after_target": 417.0, + "token_counts/after_think": 460.0, + "token_counts/before_target": 
579.25, + "token_counts/before_think": 4639.0 + }, + { + "avg_penalty/after_target": 1.984910011291504, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.336493656039238, + "avg_penalty/before_think": 0.5939469859004021, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 739.75, + "completions/max_terminated_length": 644.75, + "completions/mean_length": 396.015625, + "completions/mean_terminated_length": 386.29896545410156, + "completions/min_length": 152.75, + "completions/min_terminated_length": 152.75, + "epoch": 0.0625, + "grad_norm": 0.6584233045578003, + "kl": 0.07086181640625, + "learning_rate": 1.2400000000000002e-05, + "loss": 0.0854, + "num_tokens": 5169695.0, + "reward": 1.98046875, + "reward_std": 0.078125, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 125, + "token_counts/after_target": 574.5, + "token_counts/after_think": 511.75, + "token_counts/before_target": 614.75, + "token_counts/before_think": 4635.25 + }, + { + "avg_penalty/after_target": 1.8155847191810608, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5029746070504189, + "avg_penalty/before_think": 0.6071944162249565, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.75, + "completions/max_terminated_length": 779.75, + "completions/mean_length": 412.1875, + "completions/mean_terminated_length": 412.1875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.063, + 
"grad_norm": 0.6914922595024109, + "kl": 0.06964111328125, + "learning_rate": 1.25e-05, + "loss": 0.0098, + "num_tokens": 5207883.0, + "reward": 2.15234375, + "reward_std": 0.13103902339935303, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.11967839300632477, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 126, + "token_counts/after_target": 745.5, + "token_counts/after_think": 521.75, + "token_counts/before_target": 1351.75, + "token_counts/before_think": 3976.0 + }, + { + "avg_penalty/after_target": 2.688849300146103, + "avg_penalty/after_think": 3.887681245803833, + "avg_penalty/before_target": 0.2999168187379837, + "avg_penalty/before_think": 0.5968490168452263, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.5, + "completions/max_terminated_length": 624.5, + "completions/mean_length": 349.375, + "completions/mean_terminated_length": 349.375, + "completions/min_length": 93.75, + "completions/min_terminated_length": 93.75, + "epoch": 0.0635, + "grad_norm": 0.06372141093015671, + "kl": 0.07049560546875, + "learning_rate": 1.2600000000000001e-05, + "loss": 0.0028, + "num_tokens": 5239939.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 127, + "token_counts/after_target": 506.75, + "token_counts/after_think": 444.5, + "token_counts/before_target": 891.5, + "token_counts/before_think": 3747.25 + }, + { + "avg_penalty/after_target": 1.9205500483512878, + "avg_penalty/after_think": 3.8002288341522217, + 
"avg_penalty/before_target": 0.5476045832037926, + "avg_penalty/before_think": 0.6350259035825729, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 759.25, + "completions/max_terminated_length": 735.5, + "completions/mean_length": 439.265625, + "completions/mean_terminated_length": 414.6382293701172, + "completions/min_length": 116.25, + "completions/min_terminated_length": 116.25, + "epoch": 0.064, + "grad_norm": 0.6913912892341614, + "kl": 0.087158203125, + "learning_rate": 1.27e-05, + "loss": 0.0472, + "num_tokens": 5277828.0, + "reward": 2.06640625, + "reward_std": 0.2898138016462326, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.1280868947505951, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.10077822208404541, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.06442352384328842, + "step": 128, + "token_counts/after_target": 823.25, + "token_counts/after_think": 446.25, + "token_counts/before_target": 1250.5, + "token_counts/before_think": 4508.25 + }, + { + "avg_penalty/after_target": 2.508962094783783, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4639400951564312, + "avg_penalty/before_think": 0.6272765696048737, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.5, + "completions/max_terminated_length": 789.5, + "completions/mean_length": 416.109375, + "completions/mean_terminated_length": 416.109375, + "completions/min_length": 122.75, + "completions/min_terminated_length": 122.75, + "epoch": 0.0645, + "grad_norm": 0.078030526638031, + "kl": 0.06317138671875, + "learning_rate": 1.2800000000000001e-05, + 
"loss": 0.0025, + "num_tokens": 5313499.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 129, + "token_counts/after_target": 789.75, + "token_counts/after_think": 429.75, + "token_counts/before_target": 1780.0, + "token_counts/before_think": 3658.25 + }, + { + "avg_penalty/after_target": 2.0986780524253845, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5752614736557007, + "avg_penalty/before_think": 0.6404348164796829, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 814.25, + "completions/max_terminated_length": 814.25, + "completions/mean_length": 473.1875, + "completions/mean_terminated_length": 473.1875, + "completions/min_length": 225.75, + "completions/min_terminated_length": 225.75, + "epoch": 0.065, + "grad_norm": 0.18532074987888336, + "kl": 0.08367919921875, + "learning_rate": 1.2900000000000002e-05, + "loss": 0.0033, + "num_tokens": 5353719.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 130, + "token_counts/after_target": 1468.75, + "token_counts/after_think": 508.0, + "token_counts/before_target": 2038.75, + "token_counts/before_think": 3555.5 + }, + { + "avg_penalty/after_target": 2.209033250808716, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4732990711927414, + "avg_penalty/before_think": 0.6401621848344803, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 729.75, + "completions/max_terminated_length": 729.75, + "completions/mean_length": 451.0625, + "completions/mean_terminated_length": 451.0625, + "completions/min_length": 128.5, + "completions/min_terminated_length": 128.5, + "epoch": 0.0655, + "grad_norm": 0.06302894651889801, + "kl": 0.074462890625, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.003, + "num_tokens": 5393259.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 131, + "token_counts/after_target": 1118.75, + "token_counts/after_think": 636.75, + "token_counts/before_target": 1707.0, + "token_counts/before_think": 3754.5 + }, + { + "avg_penalty/after_target": 1.8152810335159302, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3104119524359703, + "avg_penalty/before_think": 0.6258042454719543, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.25, + "completions/max_terminated_length": 813.25, + "completions/mean_length": 407.515625, + "completions/mean_terminated_length": 407.515625, + "completions/min_length": 90.25, + "completions/min_terminated_length": 90.25, + "epoch": 0.066, + "grad_norm": 0.08303914964199066, + "kl": 0.091552734375, + "learning_rate": 1.3100000000000002e-05, + "loss": 0.0037, + "num_tokens": 5427948.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + 
"rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 132, + "token_counts/after_target": 490.0, + "token_counts/after_think": 469.5, + "token_counts/before_target": 1107.5, + "token_counts/before_think": 4453.25 + }, + { + "avg_penalty/after_target": 2.314144730567932, + "avg_penalty/after_think": 3.8882634043693542, + "avg_penalty/before_target": 0.4314361736178398, + "avg_penalty/before_think": 0.611474834382534, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.25, + "completions/max_terminated_length": 719.25, + "completions/mean_length": 369.796875, + "completions/mean_terminated_length": 369.796875, + "completions/min_length": 113.75, + "completions/min_terminated_length": 113.75, + "epoch": 0.0665, + "grad_norm": 0.07467034459114075, + "kl": 0.0931396484375, + "learning_rate": 1.3200000000000002e-05, + "loss": 0.0037, + "num_tokens": 5462175.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 133, + "token_counts/after_target": 895.75, + "token_counts/after_think": 142.25, + "token_counts/before_target": 1272.25, + "token_counts/before_think": 3606.5 + }, + { + "avg_penalty/after_target": 2.2685275971889496, + "avg_penalty/after_think": 3.3295714259147644, + "avg_penalty/before_target": 0.5217575952410698, + "avg_penalty/before_think": 0.6803206354379654, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 778.0, + "completions/max_terminated_length": 778.0, + 
"completions/mean_length": 476.921875, + "completions/mean_terminated_length": 476.921875, + "completions/min_length": 159.25, + "completions/min_terminated_length": 159.25, + "epoch": 0.067, + "grad_norm": 0.5003976821899414, + "kl": 0.095947265625, + "learning_rate": 1.3300000000000001e-05, + "loss": -0.029, + "num_tokens": 5501530.0, + "reward": 2.03125, + "reward_std": 0.08539125323295593, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 134, + "token_counts/after_target": 1562.25, + "token_counts/after_think": 244.75, + "token_counts/before_target": 2316.75, + "token_counts/before_think": 3507.0 + }, + { + "avg_penalty/after_target": 1.7518301904201508, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.7010683715343475, + "avg_penalty/before_think": 0.6801691949367523, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 924.5, + "completions/max_terminated_length": 795.5, + "completions/mean_length": 517.90625, + "completions/mean_terminated_length": 499.9458465576172, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.0675, + "grad_norm": 0.6919293403625488, + "kl": 0.0838623046875, + "learning_rate": 1.3400000000000002e-05, + "loss": 0.1337, + "num_tokens": 5546548.0, + "reward": 2.0, + "reward_std": 0.24433013796806335, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.0625, + "step": 135, + "token_counts/after_target": 
1762.75, + "token_counts/after_think": 673.25, + "token_counts/before_target": 2322.5, + "token_counts/before_think": 3528.0 + }, + { + "avg_penalty/after_target": 1.7176029086112976, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5790595933794975, + "avg_penalty/before_think": 0.6181239038705826, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 737.75, + "completions/max_terminated_length": 737.75, + "completions/mean_length": 463.296875, + "completions/mean_terminated_length": 463.296875, + "completions/min_length": 128.75, + "completions/min_terminated_length": 128.75, + "epoch": 0.068, + "grad_norm": 0.5417434573173523, + "kl": 0.0850830078125, + "learning_rate": 1.3500000000000001e-05, + "loss": 0.0202, + "num_tokens": 5585335.0, + "reward": 2.046875, + "reward_std": 0.14789125323295593, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.14789126068353653, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 136, + "token_counts/after_target": 760.5, + "token_counts/after_think": 559.0, + "token_counts/before_target": 1554.25, + "token_counts/before_think": 4539.0 + }, + { + "avg_penalty/after_target": 2.0537874698638916, + "avg_penalty/after_think": 3.987931251525879, + "avg_penalty/before_target": 0.5478226616978645, + "avg_penalty/before_think": 0.7617126852273941, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 855.25, + "completions/max_terminated_length": 855.25, + "completions/mean_length": 558.53125, + "completions/mean_terminated_length": 558.53125, + 
"completions/min_length": 220.5, + "completions/min_terminated_length": 220.5, + "epoch": 0.0685, + "grad_norm": 0.07029804587364197, + "kl": 0.085205078125, + "learning_rate": 1.3600000000000002e-05, + "loss": 0.0034, + "num_tokens": 5631625.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 137, + "token_counts/after_target": 1727.5, + "token_counts/after_think": 364.75, + "token_counts/before_target": 2395.25, + "token_counts/before_think": 4449.0 + }, + { + "avg_penalty/after_target": 1.8642631471157074, + "avg_penalty/after_think": 3.9419026374816895, + "avg_penalty/before_target": 0.5547195449471474, + "avg_penalty/before_think": 0.62205820530653, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 825.75, + "completions/max_terminated_length": 811.25, + "completions/mean_length": 577.546875, + "completions/mean_terminated_length": 561.5300598144531, + "completions/min_length": 276.75, + "completions/min_terminated_length": 276.75, + "epoch": 0.069, + "grad_norm": 0.4180189371109009, + "kl": 0.08203125, + "learning_rate": 1.3700000000000003e-05, + "loss": 0.0122, + "num_tokens": 5680396.0, + "reward": 1.9609375, + "reward_std": 0.27273833751678467, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.1632782220840454, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.06520001962780952, + "step": 138, + "token_counts/after_target": 1048.75, + "token_counts/after_think": 831.0, + "token_counts/before_target": 1020.5, + 
"token_counts/before_think": 6340.5 + }, + { + "avg_penalty/after_target": 2.3241087198257446, + "avg_penalty/after_think": 3.951620042324066, + "avg_penalty/before_target": 0.7433303743600845, + "avg_penalty/before_think": 0.73981773853302, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 951.25, + "completions/max_terminated_length": 941.75, + "completions/mean_length": 652.828125, + "completions/mean_terminated_length": 648.7698059082031, + "completions/min_length": 322.75, + "completions/min_terminated_length": 322.75, + "epoch": 0.0695, + "grad_norm": 0.3699907958507538, + "kl": 0.07672119140625, + "learning_rate": 1.38e-05, + "loss": 0.0226, + "num_tokens": 5732961.0, + "reward": 1.98046875, + "reward_std": 0.078125, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 139, + "token_counts/after_target": 2213.0, + "token_counts/after_think": 576.0, + "token_counts/before_target": 1903.0, + "token_counts/before_think": 5753.25 + }, + { + "avg_penalty/after_target": 1.8971133828163147, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5950041674077511, + "avg_penalty/before_think": 0.7632802128791809, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 940.0, + "completions/max_terminated_length": 938.25, + "completions/mean_length": 639.4375, + "completions/mean_terminated_length": 630.8928680419922, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.07, + 
"grad_norm": 0.2851007282733917, + "kl": 0.07861328125, + "learning_rate": 1.39e-05, + "loss": 0.0306, + "num_tokens": 5784861.0, + "reward": 1.953125, + "reward_std": 0.13010412454605103, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.08539126068353653, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.04841229319572449, + "step": 140, + "token_counts/after_target": 1687.0, + "token_counts/after_think": 915.5, + "token_counts/before_target": 1953.75, + "token_counts/before_think": 5674.75 + }, + { + "avg_penalty/after_target": 2.022271752357483, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4688657596707344, + "avg_penalty/before_think": 0.5873059928417206, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.25, + "completions/max_terminated_length": 707.25, + "completions/mean_length": 468.640625, + "completions/mean_terminated_length": 468.640625, + "completions/min_length": 258.75, + "completions/min_terminated_length": 258.75, + "epoch": 0.0705, + "grad_norm": 0.47587206959724426, + "kl": 0.0999755859375, + "learning_rate": 1.4e-05, + "loss": 0.0281, + "num_tokens": 5822742.0, + "reward": 2.046875, + "reward_std": 0.10077822208404541, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 141, + "token_counts/after_target": 985.25, + "token_counts/after_think": 595.75, + "token_counts/before_target": 1797.75, + "token_counts/before_think": 4119.5 + }, + { + "avg_penalty/after_target": 2.412348449230194, + "avg_penalty/after_think": 
3.522856295108795, + "avg_penalty/before_target": 0.7685453146696091, + "avg_penalty/before_think": 0.7004053145647049, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 908.25, + "completions/max_terminated_length": 856.25, + "completions/mean_length": 635.046875, + "completions/mean_terminated_length": 628.3750152587891, + "completions/min_length": 229.75, + "completions/min_terminated_length": 229.75, + "epoch": 0.071, + "grad_norm": 0.4974897801876068, + "kl": 0.083984375, + "learning_rate": 1.41e-05, + "loss": 0.0209, + "num_tokens": 5872745.0, + "reward": 1.98828125, + "reward_std": 0.171875, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.046875, + "step": 142, + "token_counts/after_target": 2354.75, + "token_counts/after_think": 525.0, + "token_counts/before_target": 1870.0, + "token_counts/before_think": 5411.0 + }, + { + "avg_penalty/after_target": 2.2504879534244537, + "avg_penalty/after_think": 3.972554385662079, + "avg_penalty/before_target": 0.594853013753891, + "avg_penalty/before_think": 0.6931700110435486, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 917.75, + "completions/max_terminated_length": 838.25, + "completions/mean_length": 556.59375, + "completions/mean_terminated_length": 535.9712829589844, + "completions/min_length": 274.25, + "completions/min_terminated_length": 274.25, + "epoch": 0.0715, + "grad_norm": 0.6445760726928711, + "kl": 0.08837890625, + "learning_rate": 1.4200000000000001e-05, + 
"loss": 0.1005, + "num_tokens": 5917503.0, + "reward": 1.92578125, + "reward_std": 0.22755970060825348, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.14789126068353653, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.07966844737529755, + "step": 143, + "token_counts/after_target": 1567.0, + "token_counts/after_think": 647.25, + "token_counts/before_target": 1676.5, + "token_counts/before_think": 5014.75 + }, + { + "avg_penalty/after_target": 2.2171575129032135, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.6012844294309616, + "avg_penalty/before_think": 0.6303495764732361, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 754.25, + "completions/max_terminated_length": 754.25, + "completions/mean_length": 472.484375, + "completions/mean_terminated_length": 472.484375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.072, + "grad_norm": 0.13563261926174164, + "kl": 0.087646484375, + "learning_rate": 1.43e-05, + "loss": 0.0035, + "num_tokens": 5956734.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 144, + "token_counts/after_target": 1096.5, + "token_counts/after_think": 652.75, + "token_counts/before_target": 1856.5, + "token_counts/before_think": 3954.0 + }, + { + "avg_penalty/after_target": 1.9445462226867676, + "avg_penalty/after_think": 3.995429217815399, + "avg_penalty/before_target": 0.49935996532440186, + "avg_penalty/before_think": 0.6021382957696915, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 743.75, + "completions/max_terminated_length": 723.75, + "completions/mean_length": 511.03125, + "completions/mean_terminated_length": 505.65521240234375, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.0725, + "grad_norm": 0.5692989826202393, + "kl": 0.08935546875, + "learning_rate": 1.4400000000000001e-05, + "loss": 0.0178, + "num_tokens": 5999216.0, + "reward": 2.0078125, + "reward_std": 0.2667674422264099, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.14789126068353653, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.03125, + "step": 145, + "token_counts/after_target": 959.0, + "token_counts/after_think": 803.25, + "token_counts/before_target": 1297.5, + "token_counts/before_think": 5116.75 + }, + { + "avg_penalty/after_target": 2.6841870546340942, + "avg_penalty/after_think": 3.942953944206238, + "avg_penalty/before_target": 0.4175557494163513, + "avg_penalty/before_think": 0.5537527576088905, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.5, + "completions/max_terminated_length": 647.5, + "completions/mean_length": 420.953125, + "completions/mean_terminated_length": 420.953125, + "completions/min_length": 160.5, + "completions/min_terminated_length": 160.5, + "epoch": 0.073, + "grad_norm": 0.5906317830085754, + "kl": 0.0931396484375, + "learning_rate": 1.45e-05, + "loss": 0.0382, + "num_tokens": 6034925.0, + "reward": 1.98046875, + "reward_std": 0.078125, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 146, + "token_counts/after_target": 836.0, + "token_counts/after_think": 353.25, + "token_counts/before_target": 1157.75, + "token_counts/before_think": 4388.25 + }, + { + "avg_penalty/after_target": 1.9659163355827332, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.6056665182113647, + "avg_penalty/before_think": 0.6554778665304184, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.25, + "completions/max_terminated_length": 723.25, + "completions/mean_length": 463.78125, + "completions/mean_terminated_length": 463.78125, + "completions/min_length": 221.25, + "completions/min_terminated_length": 221.25, + "epoch": 0.0735, + "grad_norm": 0.4516794979572296, + "kl": 0.092041015625, + "learning_rate": 1.46e-05, + "loss": 0.0202, + "num_tokens": 6074079.0, + "reward": 1.99609375, + "reward_std": 0.015625, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 147, + "token_counts/after_target": 1394.0, + "token_counts/after_think": 338.25, + "token_counts/before_target": 1606.25, + "token_counts/before_think": 4082.0 + }, + { + "avg_penalty/after_target": 2.0162221789360046, + "avg_penalty/after_think": 3.937796711921692, + "avg_penalty/before_target": 0.489267960190773, + "avg_penalty/before_think": 0.6496924012899399, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, 
+ "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.75, + "completions/max_terminated_length": 761.75, + "completions/mean_length": 503.1875, + "completions/mean_terminated_length": 503.1875, + "completions/min_length": 167.25, + "completions/min_terminated_length": 167.25, + "epoch": 0.074, + "grad_norm": 0.44746673107147217, + "kl": 0.102783203125, + "learning_rate": 1.4700000000000002e-05, + "loss": -0.0061, + "num_tokens": 6117851.0, + "reward": 2.03125, + "reward_std": 0.08539125323295593, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 148, + "token_counts/after_target": 1106.25, + "token_counts/after_think": 625.0, + "token_counts/before_target": 1556.25, + "token_counts/before_think": 4763.5 + }, + { + "avg_penalty/after_target": 1.9851580262184143, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.40670495480298996, + "avg_penalty/before_think": 0.5877553969621658, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 445.3125, + "completions/mean_terminated_length": 445.3125, + "completions/min_length": 177.75, + "completions/min_terminated_length": 177.75, + "epoch": 0.0745, + "grad_norm": 0.08703107386827469, + "kl": 0.11083984375, + "learning_rate": 1.48e-05, + "loss": 0.0044, + "num_tokens": 6155391.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + 
"rewards/tag_count_reward/std": 0.0, + "step": 149, + "token_counts/after_target": 941.25, + "token_counts/after_think": 560.25, + "token_counts/before_target": 1462.75, + "token_counts/before_think": 4160.75 + }, + { + "avg_penalty/after_target": 2.8791965544223785, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.35743849724531174, + "avg_penalty/before_think": 0.5127094686031342, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.5, + "completions/max_terminated_length": 521.5, + "completions/mean_length": 306.640625, + "completions/mean_terminated_length": 306.640625, + "completions/min_length": 126.5, + "completions/min_terminated_length": 126.5, + "epoch": 0.075, + "grad_norm": 0.48224157094955444, + "kl": 0.121826171875, + "learning_rate": 1.4900000000000001e-05, + "loss": 0.0202, + "num_tokens": 6183960.0, + "reward": 1.98046875, + "reward_std": 0.078125, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 150, + "token_counts/after_target": 431.0, + "token_counts/after_think": 319.75, + "token_counts/before_target": 1121.5, + "token_counts/before_think": 3034.0 + }, + { + "avg_penalty/after_target": 1.8648143112659454, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5525211170315742, + "avg_penalty/before_think": 0.616817906498909, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.25, + "completions/max_terminated_length": 687.25, + "completions/mean_length": 349.875, + 
"completions/mean_terminated_length": 349.875, + "completions/min_length": 128.25, + "completions/min_terminated_length": 128.25, + "epoch": 0.0755, + "grad_norm": 0.3499680757522583, + "kl": 0.1099853515625, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.0044, + "num_tokens": 6214000.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 151, + "token_counts/after_target": 801.25, + "token_counts/after_think": 443.5, + "token_counts/before_target": 1273.0, + "token_counts/before_think": 3080.25 + }, + { + "avg_penalty/after_target": 2.389873683452606, + "avg_penalty/after_think": 3.7614088654518127, + "avg_penalty/before_target": 0.728045716881752, + "avg_penalty/before_think": 0.6791111379861832, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 840.25, + "completions/max_terminated_length": 722.5, + "completions/mean_length": 480.703125, + "completions/mean_terminated_length": 437.8047790527344, + "completions/min_length": 190.5, + "completions/min_terminated_length": 190.5, + "epoch": 0.076, + "grad_norm": 0.831678569316864, + "kl": 0.1160888671875, + "learning_rate": 1.5100000000000001e-05, + "loss": 0.151, + "num_tokens": 6255245.0, + "reward": 1.703125, + "reward_std": 0.6522496119141579, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.11180340498685837, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4057852029800415, + "rewards/tag_count_reward/mean": 0.859375, + "rewards/tag_count_reward/std": 0.2283649630844593, + "step": 152, + "token_counts/after_target": 1877.25, + "token_counts/after_think": 236.25, 
+ "token_counts/before_target": 2014.25, + "token_counts/before_think": 3563.5 + }, + { + "avg_penalty/after_target": 2.5384442508220673, + "avg_penalty/after_think": 3.907413959503174, + "avg_penalty/before_target": 0.26085513085126877, + "avg_penalty/before_think": 0.5452110543847084, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.25, + "completions/max_terminated_length": 631.25, + "completions/mean_length": 407.859375, + "completions/mean_terminated_length": 407.859375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.0765, + "grad_norm": 0.45929133892059326, + "kl": 0.15234375, + "learning_rate": 1.5200000000000002e-05, + "loss": -0.0103, + "num_tokens": 6291556.0, + "reward": 2.03125, + "reward_std": 0.08539125323295593, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 153, + "token_counts/after_target": 594.0, + "token_counts/after_think": 402.0, + "token_counts/before_target": 1437.75, + "token_counts/before_think": 4092.0 + }, + { + "avg_penalty/after_target": 2.6515265703201294, + "avg_penalty/after_think": 3.9284502267837524, + "avg_penalty/before_target": 0.29213934764266014, + "avg_penalty/before_think": 0.497448094189167, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.25, + "completions/max_terminated_length": 552.25, + "completions/mean_length": 297.453125, + "completions/mean_terminated_length": 297.453125, + "completions/min_length": 110.75, + 
"completions/min_terminated_length": 110.75, + "epoch": 0.077, + "grad_norm": 0.15288394689559937, + "kl": 0.178466796875, + "learning_rate": 1.5300000000000003e-05, + "loss": 0.0071, + "num_tokens": 6318801.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 154, + "token_counts/after_target": 434.75, + "token_counts/after_think": 202.75, + "token_counts/before_target": 1304.0, + "token_counts/before_think": 2817.75 + }, + { + "avg_penalty/after_target": 2.690061181783676, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.2934698276221752, + "avg_penalty/before_think": 0.5594557076692581, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.25, + "completions/max_terminated_length": 664.25, + "completions/mean_length": 387.578125, + "completions/mean_terminated_length": 387.578125, + "completions/min_length": 183.25, + "completions/min_terminated_length": 183.25, + "epoch": 0.0775, + "grad_norm": 0.6442772150039673, + "kl": 0.1220703125, + "learning_rate": 1.54e-05, + "loss": -0.01, + "num_tokens": 6354678.0, + "reward": 2.03125, + "reward_std": 0.125, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.125, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 155, + "token_counts/after_target": 806.75, + "token_counts/after_think": 299.0, + "token_counts/before_target": 1079.25, + "token_counts/before_think": 4016.25 + }, + { + "avg_penalty/after_target": 2.3088817596435547, + "avg_penalty/after_think": 4.0, + 
"avg_penalty/before_target": 0.4559837728738785, + "avg_penalty/before_think": 0.6052272468805313, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.75, + "completions/max_terminated_length": 641.75, + "completions/mean_length": 406.234375, + "completions/mean_terminated_length": 406.234375, + "completions/min_length": 152.25, + "completions/min_terminated_length": 152.25, + "epoch": 0.078, + "grad_norm": 0.08316917717456818, + "kl": 0.1103515625, + "learning_rate": 1.55e-05, + "loss": 0.0044, + "num_tokens": 6390885.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 156, + "token_counts/after_target": 822.5, + "token_counts/after_think": 428.5, + "token_counts/before_target": 1489.0, + "token_counts/before_think": 3759.75 + }, + { + "avg_penalty/after_target": 2.5529034435749054, + "avg_penalty/after_think": 3.810061037540436, + "avg_penalty/before_target": 0.4853617772459984, + "avg_penalty/before_think": 0.6238057315349579, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 732.25, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 397.3125, + "completions/mean_terminated_length": 389.01563262939453, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.0785, + "grad_norm": 0.6498315334320068, + "kl": 0.1153564453125, + "learning_rate": 1.5600000000000003e-05, + "loss": 0.0611, + "num_tokens": 6427513.0, + "reward": 1.98828125, + 
"reward_std": 0.1317135989665985, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.046875, + "step": 157, + "token_counts/after_target": 803.5, + "token_counts/after_think": 298.25, + "token_counts/before_target": 1114.75, + "token_counts/before_think": 4140.5 + }, + { + "avg_penalty/after_target": 1.940656065940857, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.6258449405431747, + "avg_penalty/before_think": 0.7259499728679657, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 804.75, + "completions/max_terminated_length": 722.5, + "completions/mean_length": 432.34375, + "completions/mean_terminated_length": 422.5218811035156, + "completions/min_length": 163.25, + "completions/min_terminated_length": 163.25, + "epoch": 0.079, + "grad_norm": 5.258955955505371, + "kl": 0.169189453125, + "learning_rate": 1.5700000000000002e-05, + "loss": 0.062, + "num_tokens": 6473199.0, + "reward": 2.08203125, + "reward_std": 0.2720847874879837, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.125, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.14789126068353653, + "rewards/tag_count_reward/mean": 0.97265625, + "rewards/tag_count_reward/std": 0.07094132527709007, + "step": 158, + "token_counts/after_target": 1363.25, + "token_counts/after_think": 362.75, + "token_counts/before_target": 1940.75, + "token_counts/before_think": 3250.75 + }, + { + "avg_penalty/after_target": 2.225364863872528, + "avg_penalty/after_think": 3.93208110332489, + "avg_penalty/before_target": 0.38020334020256996, + "avg_penalty/before_think": 0.5893860757350922, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.75, + "completions/max_terminated_length": 609.75, + "completions/mean_length": 380.578125, + "completions/mean_terminated_length": 380.578125, + "completions/min_length": 191.5, + "completions/min_terminated_length": 191.5, + "epoch": 0.0795, + "grad_norm": 0.9018574357032776, + "kl": 0.1126708984375, + "learning_rate": 1.58e-05, + "loss": 0.0807, + "num_tokens": 6507668.0, + "reward": 2.18359375, + "reward_std": 0.31801527738571167, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.23989029973745346, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 159, + "token_counts/after_target": 469.5, + "token_counts/after_think": 566.25, + "token_counts/before_target": 1256.25, + "token_counts/before_think": 3797.25 + }, + { + "avg_penalty/after_target": 2.241099774837494, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5533686876296997, + "avg_penalty/before_think": 0.6514104753732681, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 830.75, + "completions/max_terminated_length": 816.25, + "completions/mean_length": 524.953125, + "completions/mean_terminated_length": 507.80804443359375, + "completions/min_length": 240.5, + "completions/min_terminated_length": 240.5, + "epoch": 0.08, + "grad_norm": 0.7409011125564575, + "kl": 0.083251953125, + "learning_rate": 1.5900000000000004e-05, + "loss": 0.1191, + "num_tokens": 6551153.0, + "reward": 1.91796875, + "reward_std": 0.2786140739917755, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.21039126068353653, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.06822281517088413, + "step": 160, + "token_counts/after_target": 1224.5, + "token_counts/after_think": 713.5, + "token_counts/before_target": 1557.75, + "token_counts/before_think": 4903.5 + }, + { + "avg_penalty/after_target": 2.374497711658478, + "avg_penalty/after_think": 3.8846643567085266, + "avg_penalty/before_target": 0.41214805841445923, + "avg_penalty/before_think": 0.718703880906105, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 732.75, + "completions/max_terminated_length": 732.75, + "completions/mean_length": 375.75, + "completions/mean_terminated_length": 375.75, + "completions/min_length": 161.75, + "completions/min_terminated_length": 161.75, + "epoch": 0.0805, + "grad_norm": 0.07206668704748154, + "kl": 0.1121826171875, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.0045, + "num_tokens": 6585345.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 161, + "token_counts/after_target": 1137.5, + "token_counts/after_think": 184.75, + "token_counts/before_target": 1701.25, + "token_counts/before_think": 2988.5 + }, + { + "avg_penalty/after_target": 2.436151772737503, + "avg_penalty/after_think": 3.7210792303085327, + "avg_penalty/before_target": 0.3486901931464672, + "avg_penalty/before_think": 0.6561544686555862, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.25, + "completions/max_terminated_length": 597.25, + "completions/mean_length": 359.875, + "completions/mean_terminated_length": 359.875, + "completions/min_length": 175.25, + "completions/min_terminated_length": 175.25, + "epoch": 0.081, + "grad_norm": 1.1354438066482544, + "kl": 0.094970703125, + "learning_rate": 1.6100000000000002e-05, + "loss": 0.1162, + "num_tokens": 6617001.0, + "reward": 1.97265625, + "reward_std": 0.09426911175251007, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.03697281517088413, + "step": 162, + "token_counts/after_target": 1113.25, + "token_counts/after_think": 128.0, + "token_counts/before_target": 2123.75, + "token_counts/before_think": 2393.0 + }, + { + "avg_penalty/after_target": 2.7923015356063843, + "avg_penalty/after_think": 3.6951141357421875, + "avg_penalty/before_target": 0.27449386939406395, + "avg_penalty/before_think": 0.4264500066637993, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.75, + "completions/max_terminated_length": 531.75, + "completions/mean_length": 248.390625, + "completions/mean_terminated_length": 248.390625, + "completions/min_length": 104.5, + "completions/min_terminated_length": 104.5, + "epoch": 0.0815, + "grad_norm": 0.09759221971035004, + "kl": 0.1234130859375, + "learning_rate": 1.62e-05, + "loss": 0.0049, + "num_tokens": 6642690.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 
0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 163, + "token_counts/after_target": 363.25, + "token_counts/after_think": 48.0, + "token_counts/before_target": 1750.75, + "token_counts/before_think": 1812.25 + }, + { + "avg_penalty/after_target": 2.1664721369743347, + "avg_penalty/after_think": 3.758768856525421, + "avg_penalty/before_target": 0.377102255821228, + "avg_penalty/before_think": 0.6046418845653534, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 287.59375, + "completions/mean_terminated_length": 287.59375, + "completions/min_length": 129.75, + "completions/min_terminated_length": 129.75, + "epoch": 0.082, + "grad_norm": 0.6151363253593445, + "kl": 0.125, + "learning_rate": 1.63e-05, + "loss": 0.0472, + "num_tokens": 6673320.0, + "reward": 1.9765625, + "reward_std": 0.09375, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.03125, + "step": 164, + "token_counts/after_target": 611.5, + "token_counts/after_think": 262.5, + "token_counts/before_target": 2335.25, + "token_counts/before_think": 1392.25 + }, + { + "avg_penalty/after_target": 2.9153342843055725, + "avg_penalty/after_think": 3.774175465106964, + "avg_penalty/before_target": 0.332279309630394, + "avg_penalty/before_think": 0.5284441486001015, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/max_terminated_length": 601.0, + 
"completions/mean_length": 314.09375, + "completions/mean_terminated_length": 314.09375, + "completions/min_length": 128.25, + "completions/min_terminated_length": 128.25, + "epoch": 0.0825, + "grad_norm": 0.09185215830802917, + "kl": 0.1171875, + "learning_rate": 1.64e-05, + "loss": 0.0047, + "num_tokens": 6702638.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 165, + "token_counts/after_target": 658.75, + "token_counts/after_think": 144.75, + "token_counts/before_target": 1898.25, + "token_counts/before_think": 2323.75 + }, + { + "avg_penalty/after_target": 2.9521531462669373, + "avg_penalty/after_think": 3.9250659942626953, + "avg_penalty/before_target": 0.30712219327688217, + "avg_penalty/before_think": 0.48223672062158585, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 241.703125, + "completions/mean_terminated_length": 241.703125, + "completions/min_length": 106.5, + "completions/min_terminated_length": 106.5, + "epoch": 0.083, + "grad_norm": 0.15195243060588837, + "kl": 0.146728515625, + "learning_rate": 1.65e-05, + "loss": 0.0059, + "num_tokens": 6726715.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 166, + "token_counts/after_target": 516.75, + "token_counts/after_think": 66.25, + "token_counts/before_target": 2081.5, + "token_counts/before_think": 
1202.75 + }, + { + "avg_penalty/after_target": 2.1214706003665924, + "avg_penalty/after_think": 3.96720689535141, + "avg_penalty/before_target": 0.40122708678245544, + "avg_penalty/before_think": 0.6380761861801147, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 333.046875, + "completions/mean_terminated_length": 333.046875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.0835, + "grad_norm": 0.5289731621742249, + "kl": 0.1273193359375, + "learning_rate": 1.66e-05, + "loss": -0.0258, + "num_tokens": 6758206.0, + "reward": 2.03125, + "reward_std": 0.08539125323295593, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 167, + "token_counts/after_target": 771.25, + "token_counts/after_think": 301.75, + "token_counts/before_target": 2616.0, + "token_counts/before_think": 1639.75 + }, + { + "avg_penalty/after_target": 2.253902018070221, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4852350950241089, + "avg_penalty/before_think": 0.7524099200963974, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.25, + "completions/max_terminated_length": 730.25, + "completions/mean_length": 406.640625, + "completions/mean_terminated_length": 406.640625, + "completions/min_length": 136.25, + "completions/min_terminated_length": 136.25, + "epoch": 0.084, + "grad_norm": 0.11990240961313248, + "kl": 
0.1220703125, + "learning_rate": 1.67e-05, + "loss": 0.0049, + "num_tokens": 6793303.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 168, + "token_counts/after_target": 1472.5, + "token_counts/after_think": 263.75, + "token_counts/before_target": 2477.0, + "token_counts/before_think": 2293.0 + }, + { + "avg_penalty/after_target": 1.902614802122116, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5282309651374817, + "avg_penalty/before_think": 0.7032182067632675, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 790.25, + "completions/max_terminated_length": 790.25, + "completions/mean_length": 494.234375, + "completions/mean_terminated_length": 494.234375, + "completions/min_length": 199.5, + "completions/min_terminated_length": 199.5, + "epoch": 0.0845, + "grad_norm": 0.2885091006755829, + "kl": 0.11083984375, + "learning_rate": 1.6800000000000002e-05, + "loss": -0.0198, + "num_tokens": 6836358.0, + "reward": 1.98046875, + "reward_std": 0.078125, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 169, + "token_counts/after_target": 1930.25, + "token_counts/after_think": 332.25, + "token_counts/before_target": 1921.75, + "token_counts/before_think": 3723.5 + }, + { + "avg_penalty/after_target": 2.2639018893241882, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4796750396490097, + "avg_penalty/before_think": 0.5866171419620514, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.25, + "completions/max_terminated_length": 670.25, + "completions/mean_length": 454.609375, + "completions/mean_terminated_length": 454.609375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.085, + "grad_norm": 0.7138770818710327, + "kl": 0.113525390625, + "learning_rate": 1.69e-05, + "loss": -0.0054, + "num_tokens": 6874669.0, + "reward": 2.0859375, + "reward_std": 0.26722075790166855, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.24866948276758194, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.03125, + "step": 170, + "token_counts/after_target": 1522.0, + "token_counts/after_think": 368.0, + "token_counts/before_target": 1986.75, + "token_counts/before_think": 3397.0 + }, + { + "avg_penalty/after_target": 1.7576183080673218, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.34829070791602135, + "avg_penalty/before_think": 0.6170357316732407, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 419.0625, + "completions/mean_terminated_length": 419.0625, + "completions/min_length": 242.25, + "completions/min_terminated_length": 242.25, + "epoch": 0.0855, + "grad_norm": 0.5081784129142761, + "kl": 0.1064453125, + "learning_rate": 1.7e-05, + "loss": 0.0135, + "num_tokens": 6911057.0, + "reward": 1.99609375, + "reward_std": 0.015625, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 171, + "token_counts/after_target": 549.25, + "token_counts/after_think": 533.25, + "token_counts/before_target": 980.0, + "token_counts/before_think": 4642.5 + }, + { + "avg_penalty/after_target": 1.7798964381217957, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5292051434516907, + "avg_penalty/before_think": 0.6577524542808533, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 749.75, + "completions/max_terminated_length": 749.75, + "completions/mean_length": 529.8125, + "completions/mean_terminated_length": 529.8125, + "completions/min_length": 333.25, + "completions/min_terminated_length": 333.25, + "epoch": 0.086, + "grad_norm": 0.3797433078289032, + "kl": 0.118408203125, + "learning_rate": 1.7100000000000002e-05, + "loss": 0.021, + "num_tokens": 6953829.0, + "reward": 1.9765625, + "reward_std": 0.09375, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.03125, + "step": 172, + "token_counts/after_target": 1237.25, + "token_counts/after_think": 721.25, + "token_counts/before_target": 1647.75, + "token_counts/before_think": 4870.75 + }, + { + "avg_penalty/after_target": 1.8729398846626282, + "avg_penalty/after_think": 3.9528648257255554, + "avg_penalty/before_target": 0.42718636989593506, + "avg_penalty/before_think": 0.7413511723279953, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 
0.015625, + "completions/max_length": 907.5, + "completions/max_terminated_length": 883.5, + "completions/mean_length": 642.359375, + "completions/mean_terminated_length": 635.3614654541016, + "completions/min_length": 408.75, + "completions/min_terminated_length": 408.75, + "epoch": 0.0865, + "grad_norm": 0.5095343589782715, + "kl": 0.1180419921875, + "learning_rate": 1.72e-05, + "loss": 0.012, + "num_tokens": 7011324.0, + "reward": 2.00390625, + "reward_std": 0.18204622715711594, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.046875, + "step": 173, + "token_counts/after_target": 1623.75, + "token_counts/after_think": 772.25, + "token_counts/before_target": 1275.0, + "token_counts/before_think": 6606.75 + }, + { + "avg_penalty/after_target": 1.4860627949237823, + "avg_penalty/after_think": 3.9305770993232727, + "avg_penalty/before_target": 0.5633728429675102, + "avg_penalty/before_think": 0.7323218584060669, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.5, + "completions/max_terminated_length": 796.5, + "completions/mean_length": 551.96875, + "completions/mean_terminated_length": 551.96875, + "completions/min_length": 333.25, + "completions/min_terminated_length": 333.25, + "epoch": 0.087, + "grad_norm": 0.5005184412002563, + "kl": 0.1173095703125, + "learning_rate": 1.73e-05, + "loss": -0.0161, + "num_tokens": 7057498.0, + "reward": 2.03125, + "reward_std": 0.08539125323295593, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 
1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 174, + "token_counts/after_target": 1193.0, + "token_counts/after_think": 761.0, + "token_counts/before_target": 1249.25, + "token_counts/before_think": 5628.25 + }, + { + "avg_penalty/after_target": 1.3352841436862946, + "avg_penalty/after_think": 3.9618682265281677, + "avg_penalty/before_target": 0.5302209928631783, + "avg_penalty/before_think": 0.6818176060914993, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 888.25, + "completions/max_terminated_length": 878.75, + "completions/mean_length": 570.171875, + "completions/mean_terminated_length": 558.3370666503906, + "completions/min_length": 368.5, + "completions/min_terminated_length": 368.5, + "epoch": 0.0875, + "grad_norm": 0.477997750043869, + "kl": 0.1268310546875, + "learning_rate": 1.7400000000000003e-05, + "loss": 0.0338, + "num_tokens": 7102805.0, + "reward": 2.0234375, + "reward_std": 0.22204852104187012, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.1632782220840454, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.08539126068353653, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.021347815170884132, + "step": 175, + "token_counts/after_target": 730.25, + "token_counts/after_think": 1009.5, + "token_counts/before_target": 844.25, + "token_counts/before_think": 6538.75 + }, + { + "avg_penalty/after_target": 1.5753159672021866, + "avg_penalty/after_think": 3.89182311296463, + "avg_penalty/before_target": 0.7886732965707779, + "avg_penalty/before_think": 0.8041042387485504, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 
1024.0, + "completions/max_terminated_length": 945.75, + "completions/mean_length": 743.21875, + "completions/mean_terminated_length": 724.5000457763672, + "completions/min_length": 497.75, + "completions/min_terminated_length": 497.75, + "epoch": 0.088, + "grad_norm": 0.7037478089332581, + "kl": 0.10546875, + "learning_rate": 1.7500000000000002e-05, + "loss": 0.1013, + "num_tokens": 7160691.0, + "reward": 2.0078125, + "reward_std": 0.41618984937667847, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.125, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.25, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.09375, + "step": 176, + "token_counts/after_target": 1426.5, + "token_counts/after_think": 1226.0, + "token_counts/before_target": 771.25, + "token_counts/before_think": 8467.75 + }, + { + "avg_penalty/after_target": 1.5274848937988281, + "avg_penalty/after_think": 3.9713975191116333, + "avg_penalty/before_target": 0.6336059868335724, + "avg_penalty/before_think": 0.6623781770467758, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 864.5, + "completions/max_terminated_length": 836.75, + "completions/mean_length": 631.859375, + "completions/mean_terminated_length": 626.7833404541016, + "completions/min_length": 419.5, + "completions/min_terminated_length": 419.5, + "epoch": 0.0885, + "grad_norm": 0.8243037462234497, + "kl": 0.122802734375, + "learning_rate": 1.76e-05, + "loss": 0.0636, + "num_tokens": 7208538.0, + "reward": 1.8671875, + "reward_std": 0.4208250492811203, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.29930340498685837, + "rewards/tag_count_reward/mean": 0.9609375, + 
"rewards/tag_count_reward/std": 0.09774631634354591, + "step": 177, + "token_counts/after_target": 713.25, + "token_counts/after_think": 1108.5, + "token_counts/before_target": 576.25, + "token_counts/before_think": 7711.75 + }, + { + "avg_penalty/after_target": 1.7173167765140533, + "avg_penalty/after_think": 3.74483585357666, + "avg_penalty/before_target": 0.4841815121471882, + "avg_penalty/before_think": 0.7302140295505524, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 857.75, + "completions/max_terminated_length": 856.25, + "completions/mean_length": 628.640625, + "completions/mean_terminated_length": 618.6346282958984, + "completions/min_length": 315.5, + "completions/min_terminated_length": 315.5, + "epoch": 0.089, + "grad_norm": 0.40461277961730957, + "kl": 0.11328125, + "learning_rate": 1.77e-05, + "loss": 0.0522, + "num_tokens": 7259635.0, + "reward": 1.98828125, + "reward_std": 0.22675099968910217, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.10077822208404541, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.025194555521011353, + "step": 178, + "token_counts/after_target": 1003.75, + "token_counts/after_think": 835.5, + "token_counts/before_target": 583.75, + "token_counts/before_think": 7635.25 + }, + { + "avg_penalty/after_target": 1.309057891368866, + "avg_penalty/after_think": 3.974149465560913, + "avg_penalty/before_target": 0.4835613965988159, + "avg_penalty/before_think": 0.6727772206068039, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.0, + 
"completions/max_terminated_length": 840.0, + "completions/mean_length": 610.4375, + "completions/mean_terminated_length": 610.4375, + "completions/min_length": 335.5, + "completions/min_terminated_length": 335.5, + "epoch": 0.0895, + "grad_norm": 0.3871749937534332, + "kl": 0.1324462890625, + "learning_rate": 1.7800000000000002e-05, + "loss": -0.0093, + "num_tokens": 7307663.0, + "reward": 2.078125, + "reward_std": 0.11967839300632477, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.11967839300632477, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 179, + "token_counts/after_target": 489.25, + "token_counts/after_think": 873.75, + "token_counts/before_target": 466.0, + "token_counts/before_think": 7938.0 + }, + { + "avg_penalty/after_target": 1.6493703424930573, + "avg_penalty/after_think": 3.9634212851524353, + "avg_penalty/before_target": 0.4357387349009514, + "avg_penalty/before_think": 0.5943887829780579, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 851.0, + "completions/max_terminated_length": 851.0, + "completions/mean_length": 549.109375, + "completions/mean_terminated_length": 549.109375, + "completions/min_length": 319.75, + "completions/min_terminated_length": 319.75, + "epoch": 0.09, + "grad_norm": 0.46161216497421265, + "kl": 0.125732421875, + "learning_rate": 1.79e-05, + "loss": 0.046, + "num_tokens": 7351558.0, + "reward": 2.046875, + "reward_std": 0.14789125323295593, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.14789126068353653, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 180, + 
"token_counts/after_target": 339.5, + "token_counts/after_think": 952.25, + "token_counts/before_target": 668.25, + "token_counts/before_think": 6825.75 + }, + { + "avg_penalty/after_target": 1.5848025381565094, + "avg_penalty/after_think": 3.952364206314087, + "avg_penalty/before_target": 0.42877186089754105, + "avg_penalty/before_think": 0.5866428762674332, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 801.0, + "completions/max_terminated_length": 801.0, + "completions/mean_length": 496.421875, + "completions/mean_terminated_length": 496.421875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.0905, + "grad_norm": 0.535689651966095, + "kl": 0.1494140625, + "learning_rate": 1.8e-05, + "loss": 0.0016, + "num_tokens": 7391793.0, + "reward": 2.03125, + "reward_std": 0.08539125323295593, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 181, + "token_counts/after_target": 337.75, + "token_counts/after_think": 758.75, + "token_counts/before_target": 437.25, + "token_counts/before_think": 6409.0 + }, + { + "avg_penalty/after_target": 1.487826555967331, + "avg_penalty/after_think": 3.9573466181755066, + "avg_penalty/before_target": 0.3551987558603287, + "avg_penalty/before_think": 0.625763326883316, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 867.25, + "completions/max_terminated_length": 867.25, + "completions/mean_length": 552.859375, + "completions/mean_terminated_length": 
552.859375, + "completions/min_length": 185.25, + "completions/min_terminated_length": 185.25, + "epoch": 0.091, + "grad_norm": 0.5623911023139954, + "kl": 0.142333984375, + "learning_rate": 1.8100000000000003e-05, + "loss": 0.0206, + "num_tokens": 7438888.0, + "reward": 1.99609375, + "reward_std": 0.140625, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 182, + "token_counts/after_target": 454.75, + "token_counts/after_think": 1158.25, + "token_counts/before_target": 540.75, + "token_counts/before_think": 6692.0 + }, + { + "avg_penalty/after_target": 1.8944500386714935, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 1.0088611617684364, + "avg_penalty/before_think": 0.6113651096820831, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 815.5, + "completions/max_terminated_length": 800.75, + "completions/mean_length": 533.890625, + "completions/mean_terminated_length": 520.5044708251953, + "completions/min_length": 285.5, + "completions/min_terminated_length": 285.5, + "epoch": 0.0915, + "grad_norm": 0.397124320268631, + "kl": 0.1490478515625, + "learning_rate": 1.8200000000000002e-05, + "loss": 0.0363, + "num_tokens": 7486097.0, + "reward": 2.046875, + "reward_std": 0.20476104319095612, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.125, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.08539126068353653, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.04841229319572449, + "step": 183, + "token_counts/after_target": 427.25, + "token_counts/after_think": 985.5, + 
"token_counts/before_target": 395.25, + "token_counts/before_think": 6734.25 + }, + { + "avg_penalty/after_target": 1.926129698753357, + "avg_penalty/after_think": 3.834779918193817, + "avg_penalty/before_target": 0.40161920338869095, + "avg_penalty/before_think": 0.5996511951088905, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 679.75, + "completions/max_terminated_length": 659.75, + "completions/mean_length": 433.4375, + "completions/mean_terminated_length": 426.0843811035156, + "completions/min_length": 164.5, + "completions/min_terminated_length": 164.5, + "epoch": 0.092, + "grad_norm": 0.6493479013442993, + "kl": 0.1572265625, + "learning_rate": 1.83e-05, + "loss": 0.0902, + "num_tokens": 7523933.0, + "reward": 2.16015625, + "reward_std": 0.29155339300632477, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.18217839300632477, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.046875, + "step": 184, + "token_counts/after_target": 422.0, + "token_counts/after_think": 678.75, + "token_counts/before_target": 825.0, + "token_counts/before_think": 5009.25 + }, + { + "avg_penalty/after_target": 1.8789711892604828, + "avg_penalty/after_think": 3.7174981832504272, + "avg_penalty/before_target": 0.33173539489507675, + "avg_penalty/before_think": 0.5723710879683495, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 451.765625, + "completions/mean_terminated_length": 451.765625, + "completions/min_length": 247.0, + 
"completions/min_terminated_length": 247.0, + "epoch": 0.0925, + "grad_norm": 0.8409323692321777, + "kl": 0.2008056640625, + "learning_rate": 1.8400000000000003e-05, + "loss": 0.008, + "num_tokens": 7562126.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 185, + "token_counts/after_target": 351.5, + "token_counts/after_think": 845.0, + "token_counts/before_target": 721.5, + "token_counts/before_think": 5310.25 + }, + { + "avg_penalty/after_target": 2.3213887214660645, + "avg_penalty/after_think": 3.8161906599998474, + "avg_penalty/before_target": 0.5837521776556969, + "avg_penalty/before_think": 0.6037550717592239, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 661.75, + "completions/max_terminated_length": 636.25, + "completions/mean_length": 477.21875, + "completions/mean_terminated_length": 463.5420684814453, + "completions/min_length": 301.25, + "completions/min_terminated_length": 301.25, + "epoch": 0.093, + "grad_norm": 0.7531533241271973, + "kl": 0.1373291015625, + "learning_rate": 1.8500000000000002e-05, + "loss": 0.0217, + "num_tokens": 7602620.0, + "reward": 1.7890625, + "reward_std": 0.38293883204460144, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.33226002007722855, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.06520001962780952, + "step": 186, + "token_counts/after_target": 812.0, + "token_counts/after_think": 610.75, + "token_counts/before_target": 747.0, + "token_counts/before_think": 5465.75 + }, 
+ { + "avg_penalty/after_target": 1.4564403891563416, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3632310479879379, + "avg_penalty/before_think": 0.5993103682994843, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 411.0, + "completions/mean_terminated_length": 411.0, + "completions/min_length": 135.75, + "completions/min_terminated_length": 135.75, + "epoch": 0.0935, + "grad_norm": 0.8462493419647217, + "kl": 0.166015625, + "learning_rate": 1.86e-05, + "loss": 0.0251, + "num_tokens": 7636620.0, + "reward": 1.609375, + "reward_std": 0.5234046429395676, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.49776528775691986, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.049619100987911224, + "step": 187, + "token_counts/after_target": 478.0, + "token_counts/after_think": 593.5, + "token_counts/before_target": 1128.5, + "token_counts/before_think": 4376.0 + }, + { + "avg_penalty/after_target": 1.9616200029850006, + "avg_penalty/after_think": 3.610498309135437, + "avg_penalty/before_target": 0.4758821465075016, + "avg_penalty/before_think": 0.5080729722976685, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.25, + "completions/max_terminated_length": 599.25, + "completions/mean_length": 379.734375, + "completions/mean_terminated_length": 379.734375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.094, + "grad_norm": 0.8689534664154053, + "kl": 0.156494140625, 
+ "learning_rate": 1.8700000000000004e-05, + "loss": 0.0832, + "num_tokens": 7669371.0, + "reward": 1.6640625, + "reward_std": 0.6647568345069885, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.20155644416809082, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4939897432923317, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.12770001962780952, + "step": 188, + "token_counts/after_target": 755.5, + "token_counts/after_think": 352.0, + "token_counts/before_target": 1274.25, + "token_counts/before_think": 3694.0 + }, + { + "avg_penalty/after_target": 1.6219394505023956, + "avg_penalty/after_think": 3.921324670314789, + "avg_penalty/before_target": 0.4046870023012161, + "avg_penalty/before_think": 0.591826006770134, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.75, + "completions/max_terminated_length": 613.75, + "completions/mean_length": 391.890625, + "completions/mean_terminated_length": 391.890625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.0945, + "grad_norm": 0.9789850115776062, + "kl": 0.189697265625, + "learning_rate": 1.88e-05, + "loss": 0.0669, + "num_tokens": 7707876.0, + "reward": 1.546875, + "reward_std": 0.6176556348800659, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4876555874943733, + "rewards/tag_count_reward/mean": 0.875, + "rewards/tag_count_reward/std": 0.24698643758893013, + "step": 189, + "token_counts/after_target": 836.5, + "token_counts/after_think": 291.75, + "token_counts/before_target": 1863.0, + "token_counts/before_think": 3279.0 + }, + { + "avg_penalty/after_target": 2.291228413581848, + 
"avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4168955683708191, + "avg_penalty/before_think": 0.6298515722155571, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.5, + "completions/max_terminated_length": 804.5, + "completions/mean_length": 455.203125, + "completions/mean_terminated_length": 455.203125, + "completions/min_length": 143.75, + "completions/min_terminated_length": 143.75, + "epoch": 0.095, + "grad_norm": 0.7953512072563171, + "kl": 0.187744140625, + "learning_rate": 1.8900000000000002e-05, + "loss": 0.0788, + "num_tokens": 7748065.0, + "reward": 1.73046875, + "reward_std": 0.5264168158173561, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4141380712389946, + "rewards/tag_count_reward/mean": 0.88671875, + "rewards/tag_count_reward/std": 0.24241185560822487, + "step": 190, + "token_counts/after_target": 1236.5, + "token_counts/after_think": 444.5, + "token_counts/before_target": 1737.5, + "token_counts/before_think": 3864.75 + }, + { + "avg_penalty/after_target": 2.4263182878494263, + "avg_penalty/after_think": 3.867701470851898, + "avg_penalty/before_target": 0.4319103732705116, + "avg_penalty/before_think": 0.49033618718385696, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.75, + "completions/max_terminated_length": 481.75, + "completions/mean_length": 261.859375, + "completions/mean_terminated_length": 261.859375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.0955, + "grad_norm": 0.9641184210777283, + "kl": 0.17333984375, + 
"learning_rate": 1.9e-05, + "loss": -0.1127, + "num_tokens": 7774088.0, + "reward": 1.80078125, + "reward_std": 0.41375236213207245, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.3604728877544403, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.12303955852985382, + "step": 191, + "token_counts/after_target": 565.25, + "token_counts/after_think": 109.75, + "token_counts/before_target": 1678.75, + "token_counts/before_think": 1836.0 + }, + { + "avg_penalty/after_target": 2.295907437801361, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3297918103635311, + "avg_penalty/before_think": 0.5040175095200539, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.75, + "completions/max_terminated_length": 513.75, + "completions/mean_length": 262.734375, + "completions/mean_terminated_length": 262.734375, + "completions/min_length": 56.75, + "completions/min_terminated_length": 56.75, + "epoch": 0.096, + "grad_norm": 1.2919073104858398, + "kl": 0.174072265625, + "learning_rate": 1.91e-05, + "loss": 0.1242, + "num_tokens": 7799511.0, + "reward": 1.94921875, + "reward_std": 0.14515279233455658, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.14789126068353653, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.06403729319572449, + "step": 192, + "token_counts/after_target": 542.5, + "token_counts/after_think": 137.25, + "token_counts/before_target": 1839.75, + "token_counts/before_think": 1684.25 + }, + { + "avg_penalty/after_target": 2.416461020708084, + "avg_penalty/after_think": 3.8254472613334656, + 
"avg_penalty/before_target": 0.4943503141403198, + "avg_penalty/before_think": 0.6677247583866119, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.75, + "completions/max_terminated_length": 628.75, + "completions/mean_length": 278.859375, + "completions/mean_terminated_length": 278.859375, + "completions/min_length": 74.25, + "completions/min_terminated_length": 74.25, + "epoch": 0.0965, + "grad_norm": 1.02310049533844, + "kl": 0.177490234375, + "learning_rate": 1.9200000000000003e-05, + "loss": 0.0021, + "num_tokens": 7828814.0, + "reward": 1.89453125, + "reward_std": 0.3477766066789627, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2882782220840454, + "rewards/tag_count_reward/mean": 0.95703125, + "rewards/tag_count_reward/std": 0.15899410098791122, + "step": 193, + "token_counts/after_target": 791.75, + "token_counts/after_think": 255.25, + "token_counts/before_target": 1834.75, + "token_counts/before_think": 1580.0 + }, + { + "avg_penalty/after_target": 2.2736714482307434, + "avg_penalty/after_think": 3.632802724838257, + "avg_penalty/before_target": 0.3471837341785431, + "avg_penalty/before_think": 0.5342912822961807, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 239.90625, + "completions/mean_terminated_length": 239.90625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.097, + "grad_norm": 1.3265703916549683, + "kl": 0.22265625, + "learning_rate": 1.93e-05, + "loss": 0.0267, 
+ "num_tokens": 7851704.0, + "reward": 1.953125, + "reward_std": 0.2585911601781845, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.14789126068353653, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.049619100987911224, + "step": 194, + "token_counts/after_target": 494.75, + "token_counts/after_think": 79.5, + "token_counts/before_target": 1536.0, + "token_counts/before_think": 1728.25 + }, + { + "avg_penalty/after_target": 2.2887311577796936, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3424973636865616, + "avg_penalty/before_think": 0.5965477973222733, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.5, + "completions/max_terminated_length": 580.5, + "completions/mean_length": 346.234375, + "completions/mean_terminated_length": 346.234375, + "completions/min_length": 103.25, + "completions/min_terminated_length": 103.25, + "epoch": 0.0975, + "grad_norm": 0.3559892475605011, + "kl": 0.18701171875, + "learning_rate": 1.94e-05, + "loss": -0.0168, + "num_tokens": 7885095.0, + "reward": 1.9921875, + "reward_std": 0.03125, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.03125, + "step": 195, + "token_counts/after_target": 753.0, + "token_counts/after_think": 292.5, + "token_counts/before_target": 2098.25, + "token_counts/before_think": 2396.0 + }, + { + "avg_penalty/after_target": 2.6557331681251526, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.33575981110334396, + "avg_penalty/before_think": 0.6750686913728714, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.75, + "completions/max_terminated_length": 700.75, + "completions/mean_length": 367.953125, + "completions/mean_terminated_length": 367.953125, + "completions/min_length": 100.5, + "completions/min_terminated_length": 100.5, + "epoch": 0.098, + "grad_norm": 0.96341472864151, + "kl": 0.1953125, + "learning_rate": 1.95e-05, + "loss": 0.0012, + "num_tokens": 7918420.0, + "reward": 1.92578125, + "reward_std": 0.2529904991388321, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.21039126068353653, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.046875, + "step": 196, + "token_counts/after_target": 1249.25, + "token_counts/after_think": 251.0, + "token_counts/before_target": 1990.0, + "token_counts/before_think": 2397.0 + }, + { + "avg_penalty/after_target": 2.3404776453971863, + "avg_penalty/after_think": 3.9439111948013306, + "avg_penalty/before_target": 0.39227619022130966, + "avg_penalty/before_think": 0.6483332216739655, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 357.765625, + "completions/mean_terminated_length": 357.765625, + "completions/min_length": 111.5, + "completions/min_terminated_length": 111.5, + "epoch": 0.0985, + "grad_norm": 0.7978860139846802, + "kl": 0.181884765625, + "learning_rate": 1.9600000000000002e-05, + "loss": -0.0076, + "num_tokens": 7954597.0, + "reward": 2.08203125, + "reward_std": 0.39775095880031586, + "rewards/accuracy_reward/mean": 
0.15625, + "rewards/accuracy_reward/std": 0.2288651168346405, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.21039126068353653, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.033994100987911224, + "step": 197, + "token_counts/after_target": 1030.0, + "token_counts/after_think": 256.0, + "token_counts/before_target": 1479.25, + "token_counts/before_think": 2959.0 + }, + { + "avg_penalty/after_target": 1.9327960312366486, + "avg_penalty/after_think": 3.8766905069351196, + "avg_penalty/before_target": 0.4544408544898033, + "avg_penalty/before_think": 0.744233176112175, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.5, + "completions/max_terminated_length": 762.5, + "completions/mean_length": 457.953125, + "completions/mean_terminated_length": 457.953125, + "completions/min_length": 195.25, + "completions/min_terminated_length": 195.25, + "epoch": 0.099, + "grad_norm": 0.8458226919174194, + "kl": 0.22412109375, + "learning_rate": 1.97e-05, + "loss": 0.0322, + "num_tokens": 7992338.0, + "reward": 1.94140625, + "reward_std": 0.234375, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.1875, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.046875, + "step": 198, + "token_counts/after_target": 1439.0, + "token_counts/after_think": 401.75, + "token_counts/before_target": 2010.5, + "token_counts/before_think": 3476.0 + }, + { + "avg_penalty/after_target": 1.9878981113433838, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4867623746395111, + "avg_penalty/before_think": 0.7234575003385544, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.5, + "completions/max_terminated_length": 683.5, + "completions/mean_length": 456.90625, + "completions/mean_terminated_length": 456.90625, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.0995, + "grad_norm": 0.560194730758667, + "kl": 0.219970703125, + "learning_rate": 1.98e-05, + "loss": -0.0043, + "num_tokens": 8031212.0, + "reward": 1.953125, + "reward_std": 0.1875, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.0625, + "step": 199, + "token_counts/after_target": 1442.25, + "token_counts/after_think": 283.0, + "token_counts/before_target": 2618.75, + "token_counts/before_think": 2966.5 + }, + { + "avg_penalty/after_target": 2.3642702400684357, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4553295597434044, + "avg_penalty/before_think": 0.7849224507808685, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 864.5, + "completions/max_terminated_length": 864.5, + "completions/mean_length": 547.796875, + "completions/mean_terminated_length": 547.796875, + "completions/min_length": 266.75, + "completions/min_terminated_length": 266.75, + "epoch": 0.1, + "grad_norm": 0.8071867823600769, + "kl": 0.219482421875, + "learning_rate": 1.9900000000000003e-05, + "loss": 0.0593, + "num_tokens": 8080623.0, + "reward": 1.99609375, + "reward_std": 0.140625, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + 
"rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 200, + "token_counts/after_target": 1788.0, + "token_counts/after_think": 467.5, + "token_counts/before_target": 2047.25, + "token_counts/before_think": 4462.0 + }, + { + "avg_penalty/after_target": 1.8650929927825928, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.44243223965168, + "avg_penalty/before_think": 0.7537427544593811, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 874.75, + "completions/max_terminated_length": 797.75, + "completions/mean_length": 540.625, + "completions/mean_terminated_length": 531.8406295776367, + "completions/min_length": 275.25, + "completions/min_terminated_length": 275.25, + "epoch": 0.1005, + "grad_norm": 0.7290116548538208, + "kl": 0.2197265625, + "learning_rate": 2e-05, + "loss": 0.0655, + "num_tokens": 8124071.0, + "reward": 1.97265625, + "reward_std": 0.1942135989665985, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.046875, + "step": 201, + "token_counts/after_target": 1704.25, + "token_counts/after_think": 663.75, + "token_counts/before_target": 1680.25, + "token_counts/before_think": 4601.75 + }, + { + "avg_penalty/after_target": 1.9209394752979279, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4632141813635826, + "avg_penalty/before_think": 0.804025262594223, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 803.25, + 
"completions/max_terminated_length": 784.5, + "completions/mean_length": 572.5, + "completions/mean_terminated_length": 561.2388458251953, + "completions/min_length": 272.5, + "completions/min_terminated_length": 272.5, + "epoch": 0.101, + "grad_norm": 0.6676571369171143, + "kl": 0.213134765625, + "learning_rate": 1.999998476913288e-05, + "loss": 0.0547, + "num_tokens": 8168583.0, + "reward": 2.0, + "reward_std": 0.3140576481819153, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.17430340498685837, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.11180340498685837, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.027950851246714592, + "step": 202, + "token_counts/after_target": 1951.0, + "token_counts/after_think": 646.25, + "token_counts/before_target": 1718.25, + "token_counts/before_think": 4844.5 + }, + { + "avg_penalty/after_target": 2.0543223321437836, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4237484782934189, + "avg_penalty/before_think": 0.6792815327644348, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.5, + "completions/max_terminated_length": 683.5, + "completions/mean_length": 471.71875, + "completions/mean_terminated_length": 471.71875, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.1015, + "grad_norm": 0.5568601489067078, + "kl": 0.221923828125, + "learning_rate": 1.9999939076577906e-05, + "loss": 0.0196, + "num_tokens": 8208485.0, + "reward": 2.09375, + "reward_std": 0.18217839300632477, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.18217839300632477, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + 
"rewards/tag_count_reward/std": 0.0, + "step": 203, + "token_counts/after_target": 1317.5, + "token_counts/after_think": 557.5, + "token_counts/before_target": 1908.25, + "token_counts/before_think": 3764.25 + }, + { + "avg_penalty/after_target": 1.7681753039360046, + "avg_penalty/after_think": 3.829704523086548, + "avg_penalty/before_target": 0.5221203006803989, + "avg_penalty/before_think": 0.7295272946357727, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 843.5, + "completions/max_terminated_length": 843.5, + "completions/mean_length": 574.921875, + "completions/mean_terminated_length": 568.9677124023438, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.102, + "grad_norm": 0.7812898755073547, + "kl": 0.2373046875, + "learning_rate": 1.999986292247427e-05, + "loss": 0.018, + "num_tokens": 8253936.0, + "reward": 1.97265625, + "reward_std": 0.234375, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.046875, + "step": 204, + "token_counts/after_target": 1280.25, + "token_counts/after_think": 839.0, + "token_counts/before_target": 1234.25, + "token_counts/before_think": 5845.25 + }, + { + "avg_penalty/after_target": 1.7712447047233582, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.49574510008096695, + "avg_penalty/before_think": 0.6541565805673599, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 809.25, + "completions/max_terminated_length": 809.25, + 
"completions/mean_length": 555.34375, + "completions/mean_terminated_length": 555.34375, + "completions/min_length": 389.25, + "completions/min_terminated_length": 389.25, + "epoch": 0.1025, + "grad_norm": 0.4204584062099457, + "kl": 0.251953125, + "learning_rate": 1.9999756307053947e-05, + "loss": 0.0152, + "num_tokens": 8299766.0, + "reward": 2.109375, + "reward_std": 0.1280868798494339, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.1280868947505951, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 205, + "token_counts/after_target": 1152.75, + "token_counts/after_think": 870.0, + "token_counts/before_target": 1001.75, + "token_counts/before_think": 5861.0 + }, + { + "avg_penalty/after_target": 1.5322927236557007, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4647107198834419, + "avg_penalty/before_think": 0.6552916169166565, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.75, + "completions/max_terminated_length": 753.75, + "completions/mean_length": 511.515625, + "completions/mean_terminated_length": 511.515625, + "completions/min_length": 293.75, + "completions/min_terminated_length": 293.75, + "epoch": 0.103, + "grad_norm": 0.5449636578559875, + "kl": 0.24169921875, + "learning_rate": 1.9999619230641714e-05, + "loss": 0.0117, + "num_tokens": 8345735.0, + "reward": 2.109375, + "reward_std": 0.1280868798494339, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.1280868947505951, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 206, + "token_counts/after_target": 897.5, + 
"token_counts/after_think": 838.5, + "token_counts/before_target": 956.75, + "token_counts/before_think": 5491.5 + }, + { + "avg_penalty/after_target": 1.6436190903186798, + "avg_penalty/after_think": 3.9840258955955505, + "avg_penalty/before_target": 0.31511645019054413, + "avg_penalty/before_think": 0.6267484426498413, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 837.75, + "completions/max_terminated_length": 837.75, + "completions/mean_length": 551.046875, + "completions/mean_terminated_length": 551.046875, + "completions/min_length": 361.5, + "completions/min_terminated_length": 361.5, + "epoch": 0.1035, + "grad_norm": 0.19922299683094025, + "kl": 0.30615234375, + "learning_rate": 1.9999451693655125e-05, + "loss": 0.0123, + "num_tokens": 8391290.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 207, + "token_counts/after_target": 928.75, + "token_counts/after_think": 1010.75, + "token_counts/before_target": 474.0, + "token_counts/before_think": 6403.25 + }, + { + "avg_penalty/after_target": 2.190713346004486, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.9947011321783066, + "avg_penalty/before_think": 0.6214662045240402, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 973.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 610.953125, + "completions/mean_terminated_length": 553.3839111328125, + "completions/min_length": 330.0, + 
"completions/min_terminated_length": 330.0, + "epoch": 0.104, + "grad_norm": 0.9452558159828186, + "kl": 0.28662109375, + "learning_rate": 1.9999253696604522e-05, + "loss": 0.1935, + "num_tokens": 8438215.0, + "reward": 1.703125, + "reward_std": 0.573740616440773, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.38724804669618607, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.20574817061424255, + "step": 208, + "token_counts/after_target": 1566.5, + "token_counts/after_think": 944.0, + "token_counts/before_target": 1167.25, + "token_counts/before_think": 6097.5 + }, + { + "avg_penalty/after_target": 2.124800682067871, + "avg_penalty/after_think": 3.9177361130714417, + "avg_penalty/before_target": 0.5868020132184029, + "avg_penalty/before_think": 0.6530029773712158, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 964.75, + "completions/max_terminated_length": 824.25, + "completions/mean_length": 577.359375, + "completions/mean_terminated_length": 555.2281494140625, + "completions/min_length": 293.25, + "completions/min_terminated_length": 293.25, + "epoch": 0.1045, + "grad_norm": 1.0696425437927246, + "kl": 0.243408203125, + "learning_rate": 1.9999025240093045e-05, + "loss": 0.1225, + "num_tokens": 8489070.0, + "reward": 1.91796875, + "reward_std": 0.43442782759666443, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.23328252136707306, + "rewards/tag_count_reward/mean": 0.96484375, + "rewards/tag_count_reward/std": 0.12590491026639938, + "step": 209, + "token_counts/after_target": 1183.75, + "token_counts/after_think": 856.75, + 
"token_counts/before_target": 583.75, + "token_counts/before_think": 6613.5 + }, + { + "avg_penalty/after_target": 1.851302295923233, + "avg_penalty/after_think": 3.9391598105430603, + "avg_penalty/before_target": 0.38257431983947754, + "avg_penalty/before_think": 0.5735285580158234, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.5, + "completions/max_terminated_length": 587.5, + "completions/mean_length": 371.859375, + "completions/mean_terminated_length": 371.859375, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.105, + "grad_norm": 1.0334389209747314, + "kl": 0.36767578125, + "learning_rate": 1.9998766324816606e-05, + "loss": 0.0364, + "num_tokens": 8520565.0, + "reward": 1.98046875, + "reward_std": 0.34171973168849945, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.11180340498685837, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.21039126068353653, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.06524410098791122, + "step": 210, + "token_counts/after_target": 518.5, + "token_counts/after_think": 413.25, + "token_counts/before_target": 785.5, + "token_counts/before_think": 4232.5 + }, + { + "avg_penalty/after_target": 1.910911351442337, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4730774313211441, + "avg_penalty/before_think": 0.6184807494282722, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 804.75, + "completions/max_terminated_length": 623.75, + "completions/mean_length": 461.71875, + "completions/mean_terminated_length": 421.7554626464844, + 
"completions/min_length": 254.25, + "completions/min_terminated_length": 254.25, + "epoch": 0.1055, + "grad_norm": 1.472874641418457, + "kl": 0.4130859375, + "learning_rate": 1.9998476951563914e-05, + "loss": 0.1534, + "num_tokens": 8558371.0, + "reward": 1.4140625, + "reward_std": 0.5160187184810638, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.34013500809669495, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.21705039031803608, + "step": 211, + "token_counts/after_target": 1401.75, + "token_counts/after_think": 456.25, + "token_counts/before_target": 2827.5, + "token_counts/before_think": 2702.0 + }, + { + "avg_penalty/after_target": 2.390858471393585, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.6785416305065155, + "avg_penalty/before_think": 0.5212029740214348, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 710.5, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 377.734375, + "completions/mean_terminated_length": 348.41876220703125, + "completions/min_length": 202.75, + "completions/min_terminated_length": 202.75, + "epoch": 0.106, + "grad_norm": 3.2497708797454834, + "kl": 0.56298828125, + "learning_rate": 1.9998157121216442e-05, + "loss": 0.2045, + "num_tokens": 8592690.0, + "reward": 1.85546875, + "reward_std": 0.315752848982811, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.20155644416809082, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.1213802918791771, + "step": 212, + "token_counts/after_target": 1132.5, + "token_counts/after_think": 368.75, + 
"token_counts/before_target": 1040.0, + "token_counts/before_think": 3502.5 + }, + { + "avg_penalty/after_target": 3.1911020278930664, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 1.0118854194879532, + "avg_penalty/before_think": 0.6493184715509415, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 613.75, + "completions/mean_length": 524.296875, + "completions/mean_terminated_length": 332.2544403076172, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.1065, + "grad_norm": 9.955439567565918, + "kl": 1.7109375, + "learning_rate": 1.9997806834748455e-05, + "loss": 0.559, + "num_tokens": 8634661.0, + "reward": 1.32421875, + "reward_std": 0.8507693558931351, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4546433389186859, + "rewards/tag_count_reward/mean": 0.68359375, + "rewards/tag_count_reward/std": 0.4107612892985344, + "step": 213, + "token_counts/after_target": 3404.0, + "token_counts/after_think": 201.75, + "token_counts/before_target": 2542.75, + "token_counts/before_think": 2240.25 + }, + { + "avg_penalty/after_target": 2.6513707637786865, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 1.0575608611106873, + "avg_penalty/before_think": 0.9073128551244736, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 876.0, + "completions/mean_length": 686.734375, + "completions/mean_terminated_length": 478.7377014160156, + "completions/min_length": 261.25, + 
"completions/min_terminated_length": 261.25, + "epoch": 0.107, + "grad_norm": 6.569565773010254, + "kl": 3.04296875, + "learning_rate": 1.9997426093226984e-05, + "loss": 0.4717, + "num_tokens": 8686564.0, + "reward": 1.26953125, + "reward_std": 0.8146720975637436, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.53125, + "rewards/format_reward/std": 0.5122983306646347, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.3396424725651741, + "step": 214, + "token_counts/after_target": 5000.25, + "token_counts/after_think": 318.0, + "token_counts/before_target": 2940.5, + "token_counts/before_think": 2729.0 + }, + { + "avg_penalty/after_target": 2.4181312322616577, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 1.1938143968582153, + "avg_penalty/before_think": 1.6485716104507446, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 865.25, + "completions/mean_length": 797.8125, + "completions/mean_terminated_length": 477.3583450317383, + "completions/min_length": 216.25, + "completions/min_terminated_length": 216.25, + "epoch": 0.1075, + "grad_norm": 11.798130989074707, + "kl": 8.4375, + "learning_rate": 1.9997014897811834e-05, + "loss": 0.5793, + "num_tokens": 8747768.0, + "reward": 0.60546875, + "reward_std": 0.5322037637233734, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.2825859263539314, + "rewards/tag_count_reward/mean": 0.48046875, + "rewards/tag_count_reward/std": 0.2912844195961952, + "step": 215, + "token_counts/after_target": 6388.0, + "token_counts/after_think": 191.75, + "token_counts/before_target": 4301.75, + 
"token_counts/before_think": 1883.5 + }, + { + "avg_penalty/after_target": 2.327628403902054, + "avg_penalty/after_think": 3.9581969380378723, + "avg_penalty/before_target": 0.767740398645401, + "avg_penalty/before_think": 0.9806979447603226, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 688.75, + "completions/mean_length": 539.578125, + "completions/mean_terminated_length": 319.1145896911621, + "completions/min_length": 133.25, + "completions/min_terminated_length": 133.25, + "epoch": 0.108, + "grad_norm": 11.708727836608887, + "kl": 4.55859375, + "learning_rate": 1.9996573249755573e-05, + "loss": 0.5732, + "num_tokens": 8792061.0, + "reward": 1.1875, + "reward_std": 0.7134545147418976, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.421875, + "rewards/format_reward/std": 0.48558124154806137, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.2789936251938343, + "step": 216, + "token_counts/after_target": 3226.25, + "token_counts/after_think": 225.5, + "token_counts/before_target": 2819.5, + "token_counts/before_think": 2362.0 + }, + { + "avg_penalty/after_target": 2.4775474667549133, + "avg_penalty/after_think": 3.9333786964416504, + "avg_penalty/before_target": 0.9892318099737167, + "avg_penalty/before_think": 1.2955666184425354, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 321.5, + "completions/mean_length": 581.484375, + "completions/mean_terminated_length": 192.62004280090332, + "completions/min_length": 113.25, + 
"completions/min_terminated_length": 113.25, + "epoch": 0.1085, + "grad_norm": 30.94203758239746, + "kl": 18.9375, + "learning_rate": 1.9996101150403543e-05, + "loss": 1.1767, + "num_tokens": 8837740.0, + "reward": 0.984375, + "reward_std": 0.7104576975107193, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.296875, + "rewards/format_reward/std": 0.46513500809669495, + "rewards/tag_count_reward/mean": 0.6875, + "rewards/tag_count_reward/std": 0.31086745113134384, + "step": 217, + "token_counts/after_target": 4241.75, + "token_counts/after_think": 158.0, + "token_counts/before_target": 3508.25, + "token_counts/before_think": 1395.75 + }, + { + "avg_penalty/after_target": 2.6148698925971985, + "avg_penalty/after_think": 3.816972076892853, + "avg_penalty/before_target": 1.0713139772415161, + "avg_penalty/before_think": 1.5412930250167847, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 221.5, + "completions/mean_length": 689.71875, + "completions/mean_terminated_length": 171.20208740234375, + "completions/min_length": 126.5, + "completions/min_terminated_length": 126.5, + "epoch": 0.109, + "grad_norm": 6.7524333000183105, + "kl": 8.0078125, + "learning_rate": 1.9995598601193842e-05, + "loss": 0.8011, + "num_tokens": 8890330.0, + "reward": 1.05078125, + "reward_std": 0.633560374379158, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.296875, + "rewards/format_reward/std": 0.45028156042099, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.23757071420550346, + "step": 218, + "token_counts/after_target": 5656.25, + "token_counts/after_think": 104.75, + "token_counts/before_target": 3654.0, + 
"token_counts/before_think": 1620.5 + }, + { + "avg_penalty/after_target": 2.27106511592865, + "avg_penalty/after_think": 3.897931456565857, + "avg_penalty/before_target": 0.7056152373552322, + "avg_penalty/before_think": 1.1166547909379005, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 260.25, + "completions/mean_length": 435.703125, + "completions/mean_terminated_length": 189.8473014831543, + "completions/min_length": 83.25, + "completions/min_terminated_length": 83.25, + "epoch": 0.1095, + "grad_norm": 7.34257173538208, + "kl": 8.234375, + "learning_rate": 1.9995065603657317e-05, + "loss": 0.8876, + "num_tokens": 8929495.0, + "reward": 1.4296875, + "reward_std": 0.6126192361116409, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.5625, + "rewards/format_reward/std": 0.46566852182149887, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.18261515349149704, + "step": 219, + "token_counts/after_target": 2807.5, + "token_counts/after_think": 114.0, + "token_counts/before_target": 2278.25, + "token_counts/before_think": 1771.5 + }, + { + "avg_penalty/after_target": 2.5781786143779755, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.7002052962779999, + "avg_penalty/before_think": 0.9309875518083572, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 641.5, + "completions/mean_length": 419.0, + "completions/mean_terminated_length": 268.4334545135498, + "completions/min_length": 113.25, + "completions/min_terminated_length": 
113.25, + "epoch": 0.11, + "grad_norm": 8.287986755371094, + "kl": 12.515625, + "learning_rate": 1.9994502159417576e-05, + "loss": 1.0762, + "num_tokens": 8965927.0, + "reward": 1.375, + "reward_std": 0.7373226135969162, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.515625, + "rewards/format_reward/std": 0.49654312431812286, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.2093881294131279, + "step": 220, + "token_counts/after_target": 2278.5, + "token_counts/after_think": 208.0, + "token_counts/before_target": 2411.0, + "token_counts/before_think": 1806.5 + }, + { + "avg_penalty/after_target": 2.7584334313869476, + "avg_penalty/after_think": 3.932421565055847, + "avg_penalty/before_target": 0.4883626960217953, + "avg_penalty/before_think": 0.7249568402767181, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 915.25, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 346.390625, + "completions/mean_terminated_length": 221.62054061889648, + "completions/min_length": 62.75, + "completions/min_terminated_length": 62.75, + "epoch": 0.1105, + "grad_norm": 5.12542724609375, + "kl": 7.3984375, + "learning_rate": 1.999390827019096e-05, + "loss": 0.733, + "num_tokens": 9000640.0, + "reward": 1.58984375, + "reward_std": 0.5991882532835007, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4613594636321068, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.1551767848432064, + "step": 221, + "token_counts/after_target": 1575.75, + "token_counts/after_think": 201.0, + "token_counts/before_target": 2213.75, + "token_counts/before_think": 
1551.75 + }, + { + "avg_penalty/after_target": 2.2687445878982544, + "avg_penalty/after_think": 3.827000141143799, + "avg_penalty/before_target": 0.6239387467503548, + "avg_penalty/before_think": 0.773522861301899, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 915.25, + "completions/max_terminated_length": 485.5, + "completions/mean_length": 374.4375, + "completions/mean_terminated_length": 251.8134651184082, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.111, + "grad_norm": 5.735850811004639, + "kl": 9.3828125, + "learning_rate": 1.9993283937786562e-05, + "loss": 0.7691, + "num_tokens": 9041052.0, + "reward": 1.5390625, + "reward_std": 0.6149772256612778, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4776429533958435, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.16803345270454884, + "step": 222, + "token_counts/after_target": 1652.0, + "token_counts/after_think": 234.25, + "token_counts/before_target": 2220.5, + "token_counts/before_think": 1884.25 + }, + { + "avg_penalty/after_target": 2.5141773223876953, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.38002175837755203, + "avg_penalty/before_think": 0.6507325768470764, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 716.5, + "completions/max_terminated_length": 614.5, + "completions/mean_length": 296.5, + "completions/mean_terminated_length": 285.39271545410156, + "completions/min_length": 84.75, + "completions/min_terminated_length": 84.75, + "epoch": 0.1115, + 
"grad_norm": 3.2154183387756348, + "kl": 2.52734375, + "learning_rate": 1.999262916410621e-05, + "loss": 0.2568, + "num_tokens": 9069356.0, + "reward": 1.9609375, + "reward_std": 0.15625, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.03125, + "step": 223, + "token_counts/after_target": 710.5, + "token_counts/after_think": 153.5, + "token_counts/before_target": 1742.75, + "token_counts/before_think": 2137.25 + }, + { + "avg_penalty/after_target": 2.323198914527893, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4027725011110306, + "avg_penalty/before_think": 0.6066257655620575, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.25, + "completions/max_terminated_length": 478.25, + "completions/mean_length": 247.34375, + "completions/mean_terminated_length": 247.34375, + "completions/min_length": 81.5, + "completions/min_terminated_length": 81.5, + "epoch": 0.112, + "grad_norm": 1.6588071584701538, + "kl": 0.7666015625, + "learning_rate": 1.9991943951144462e-05, + "loss": 0.0684, + "num_tokens": 9096226.0, + "reward": 1.95703125, + "reward_std": 0.171875, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.046875, + "step": 224, + "token_counts/after_target": 647.0, + "token_counts/after_think": 78.0, + "token_counts/before_target": 1710.25, + "token_counts/before_think": 1522.25 + }, + { + "avg_penalty/after_target": 1.9947303235530853, + "avg_penalty/after_think": 2.7934464812278748, + 
"avg_penalty/before_target": 0.3853926584124565, + "avg_penalty/before_think": 0.4627606123685837, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 242.1875, + "completions/mean_terminated_length": 242.1875, + "completions/min_length": 89.75, + "completions/min_terminated_length": 89.75, + "epoch": 0.1125, + "grad_norm": 1.2508572340011597, + "kl": 0.5927734375, + "learning_rate": 1.9991228300988586e-05, + "loss": 0.0189, + "num_tokens": 9120494.0, + "reward": 1.9453125, + "reward_std": 0.1748654991388321, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.14789126068353653, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.03125, + "step": 225, + "token_counts/after_target": 458.75, + "token_counts/after_think": 71.25, + "token_counts/before_target": 1748.25, + "token_counts/before_think": 1596.75 + }, + { + "avg_penalty/after_target": 1.9085887372493744, + "avg_penalty/after_think": 3.3702099919319153, + "avg_penalty/before_target": 0.35476119816303253, + "avg_penalty/before_think": 0.550324097275734, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.75, + "completions/max_terminated_length": 491.75, + "completions/mean_length": 235.015625, + "completions/mean_terminated_length": 235.015625, + "completions/min_length": 62.5, + "completions/min_terminated_length": 62.5, + "epoch": 0.113, + "grad_norm": 1.3265419006347656, + "kl": 0.51025390625, + "learning_rate": 1.999048221581858e-05, + "loss": 0.0147, + "num_tokens": 
9144815.0, + "reward": 2.0703125, + "reward_std": 0.23723843693733215, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.1280868947505951, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.08539126068353653, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.03125, + "step": 226, + "token_counts/after_target": 402.0, + "token_counts/after_think": 144.25, + "token_counts/before_target": 1062.75, + "token_counts/before_think": 2151.25 + }, + { + "avg_penalty/after_target": 2.475416839122772, + "avg_penalty/after_think": 3.632906973361969, + "avg_penalty/before_target": 0.3227233961224556, + "avg_penalty/before_think": 0.5105072185397148, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.5, + "completions/max_terminated_length": 606.5, + "completions/mean_length": 251.65625, + "completions/mean_terminated_length": 251.65625, + "completions/min_length": 85.5, + "completions/min_terminated_length": 85.5, + "epoch": 0.1135, + "grad_norm": 0.19585976004600525, + "kl": 0.4111328125, + "learning_rate": 1.998970569790715e-05, + "loss": 0.0165, + "num_tokens": 9169673.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 227, + "token_counts/after_target": 423.75, + "token_counts/after_think": 174.0, + "token_counts/before_target": 1642.25, + "token_counts/before_think": 1786.5 + }, + { + "avg_penalty/after_target": 2.3881088197231293, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3273790404200554, + "avg_penalty/before_think": 0.573423333466053, + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 232.453125, + "completions/mean_terminated_length": 232.453125, + "completions/min_length": 81.25, + "completions/min_terminated_length": 81.25, + "epoch": 0.114, + "grad_norm": 1.4614561796188354, + "kl": 0.4140625, + "learning_rate": 1.9988898749619702e-05, + "loss": 0.0247, + "num_tokens": 9196214.0, + "reward": 1.9921875, + "reward_std": 0.15625, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.03125, + "step": 228, + "token_counts/after_target": 434.0, + "token_counts/after_think": 149.25, + "token_counts/before_target": 1408.0, + "token_counts/before_think": 1728.0 + }, + { + "avg_penalty/after_target": 2.086857259273529, + "avg_penalty/after_think": 3.7587963938713074, + "avg_penalty/before_target": 0.36547380685806274, + "avg_penalty/before_think": 0.6038050130009651, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.25, + "completions/max_terminated_length": 566.25, + "completions/mean_length": 255.34375, + "completions/mean_terminated_length": 255.34375, + "completions/min_length": 104.25, + "completions/min_terminated_length": 104.25, + "epoch": 0.1145, + "grad_norm": 0.7654566168785095, + "kl": 0.412109375, + "learning_rate": 1.9988061373414342e-05, + "loss": 0.0372, + "num_tokens": 9220508.0, + "reward": 2.015625, + "reward_std": 0.0625, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + 
"rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 229, + "token_counts/after_target": 661.25, + "token_counts/after_think": 74.25, + "token_counts/before_target": 1722.5, + "token_counts/before_think": 1627.5 + }, + { + "avg_penalty/after_target": 2.0303778052330017, + "avg_penalty/after_think": 3.707749664783478, + "avg_penalty/before_target": 0.3756023198366165, + "avg_penalty/before_think": 0.5370750650763512, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 658.75, + "completions/max_terminated_length": 549.25, + "completions/mean_length": 257.609375, + "completions/mean_terminated_length": 245.2989616394043, + "completions/min_length": 40.5, + "completions/min_terminated_length": 40.5, + "epoch": 0.115, + "grad_norm": 1.4913971424102783, + "kl": 0.3701171875, + "learning_rate": 1.9987193571841865e-05, + "loss": 0.2193, + "num_tokens": 9247507.0, + "reward": 1.97265625, + "reward_std": 0.109375, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.046875, + "step": 230, + "token_counts/after_target": 620.5, + "token_counts/after_think": 102.5, + "token_counts/before_target": 1613.0, + "token_counts/before_think": 1785.75 + }, + { + "avg_penalty/after_target": 2.6082414090633392, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.292229276150465, + "avg_penalty/before_think": 0.6288503333926201, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + 
"completions/max_length": 541.25, + "completions/max_terminated_length": 541.25, + "completions/mean_length": 240.125, + "completions/mean_terminated_length": 240.125, + "completions/min_length": 56.75, + "completions/min_terminated_length": 56.75, + "epoch": 0.1155, + "grad_norm": 0.764899730682373, + "kl": 0.392578125, + "learning_rate": 1.9986295347545738e-05, + "loss": 0.0521, + "num_tokens": 9272891.0, + "reward": 1.984375, + "reward_std": 0.0625, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 231, + "token_counts/after_target": 533.0, + "token_counts/after_think": 364.5, + "token_counts/before_target": 1139.25, + "token_counts/before_think": 1805.25 + }, + { + "avg_penalty/after_target": 1.9128376245498657, + "avg_penalty/after_think": 3.9191306829452515, + "avg_penalty/before_target": 0.4255586713552475, + "avg_penalty/before_think": 0.5923150330781937, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 642.75, + "completions/max_terminated_length": 567.75, + "completions/mean_length": 268.9375, + "completions/mean_terminated_length": 257.03021240234375, + "completions/min_length": 57.25, + "completions/min_terminated_length": 57.25, + "epoch": 0.116, + "grad_norm": 1.1168495416641235, + "kl": 0.37158203125, + "learning_rate": 1.998536670326212e-05, + "loss": 0.1384, + "num_tokens": 9300775.0, + "reward": 1.9609375, + "reward_std": 0.19827844202518463, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 
0.07902991026639938, + "step": 232, + "token_counts/after_target": 618.25, + "token_counts/after_think": 264.0, + "token_counts/before_target": 1082.75, + "token_counts/before_think": 2338.0 + }, + { + "avg_penalty/after_target": 2.4710595905780792, + "avg_penalty/after_think": 3.901153862476349, + "avg_penalty/before_target": 0.3139399066567421, + "avg_penalty/before_think": 0.5164420902729034, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 713.5, + "completions/max_terminated_length": 631.5, + "completions/mean_length": 327.203125, + "completions/mean_terminated_length": 317.4750061035156, + "completions/min_length": 82.5, + "completions/min_terminated_length": 82.5, + "epoch": 0.1165, + "grad_norm": 0.7051181793212891, + "kl": 0.3740234375, + "learning_rate": 1.9984407641819812e-05, + "loss": 0.0978, + "num_tokens": 9331092.0, + "reward": 1.98046875, + "reward_std": 0.078125, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 233, + "token_counts/after_target": 676.0, + "token_counts/after_think": 173.25, + "token_counts/before_target": 1436.25, + "token_counts/before_think": 2949.75 + }, + { + "avg_penalty/after_target": 2.012450784444809, + "avg_penalty/after_think": 3.768143355846405, + "avg_penalty/before_target": 0.4142155572772026, + "avg_penalty/before_think": 0.6472651660442352, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 726.75, + "completions/max_terminated_length": 667.25, + "completions/mean_length": 
358.375, + "completions/mean_terminated_length": 349.7718811035156, + "completions/min_length": 61.5, + "completions/min_terminated_length": 61.5, + "epoch": 0.117, + "grad_norm": 0.8030740022659302, + "kl": 0.365234375, + "learning_rate": 1.9983418166140286e-05, + "loss": 0.054, + "num_tokens": 9369036.0, + "reward": 1.93359375, + "reward_std": 0.3307717889547348, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.23328252136707306, + "rewards/tag_count_reward/mean": 0.98046875, + "rewards/tag_count_reward/std": 0.058320630341768265, + "step": 234, + "token_counts/after_target": 1202.0, + "token_counts/after_think": 329.0, + "token_counts/before_target": 1434.5, + "token_counts/before_think": 2768.5 + }, + { + "avg_penalty/after_target": 2.3536105155944824, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.40605493634939194, + "avg_penalty/before_think": 0.6171665638685226, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.5, + "completions/max_terminated_length": 700.5, + "completions/mean_length": 301.171875, + "completions/mean_terminated_length": 301.171875, + "completions/min_length": 87.25, + "completions/min_terminated_length": 87.25, + "epoch": 0.1175, + "grad_norm": 0.6381359100341797, + "kl": 0.36376953125, + "learning_rate": 1.9982398279237657e-05, + "loss": 0.0303, + "num_tokens": 9397223.0, + "reward": 1.984375, + "reward_std": 0.0625, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 235, + "token_counts/after_target": 848.5, + 
"token_counts/after_think": 99.25, + "token_counts/before_target": 1949.75, + "token_counts/before_think": 1921.25 + }, + { + "avg_penalty/after_target": 2.9079185724258423, + "avg_penalty/after_think": 3.6676175594329834, + "avg_penalty/before_target": 0.33596350997686386, + "avg_penalty/before_think": 0.5344427973031998, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.75, + "completions/max_terminated_length": 657.75, + "completions/mean_length": 269.171875, + "completions/mean_terminated_length": 269.171875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.118, + "grad_norm": 0.6494417190551758, + "kl": 0.35302734375, + "learning_rate": 1.998134798421867e-05, + "loss": 0.0075, + "num_tokens": 9422786.0, + "reward": 1.98046875, + "reward_std": 0.078125, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.99609375, + "rewards/tag_count_reward/std": 0.015625, + "step": 236, + "token_counts/after_target": 742.25, + "token_counts/after_think": 65.75, + "token_counts/before_target": 1757.0, + "token_counts/before_think": 1741.75 + }, + { + "avg_penalty/after_target": 2.5439202189445496, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.34581881016492844, + "avg_penalty/before_think": 0.4951043352484703, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.25, + "completions/max_terminated_length": 570.25, + "completions/mean_length": 285.640625, + "completions/mean_terminated_length": 285.640625, + "completions/min_length": 75.75, + 
"completions/min_terminated_length": 75.75, + "epoch": 0.1185, + "grad_norm": 0.9289249181747437, + "kl": 0.34521484375, + "learning_rate": 1.9980267284282718e-05, + "loss": 0.0404, + "num_tokens": 9450219.0, + "reward": 1.95703125, + "reward_std": 0.15651005506515503, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.033994100987911224, + "step": 237, + "token_counts/after_target": 690.75, + "token_counts/after_think": 166.75, + "token_counts/before_target": 2046.75, + "token_counts/before_think": 1666.0 + }, + { + "avg_penalty/after_target": 2.0265009999275208, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.43730689585208893, + "avg_penalty/before_think": 0.7787294089794159, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 769.0, + "completions/max_terminated_length": 769.0, + "completions/mean_length": 351.46875, + "completions/mean_terminated_length": 351.46875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.119, + "grad_norm": 0.5502173900604248, + "kl": 0.3046875, + "learning_rate": 1.997915618272179e-05, + "loss": 0.0085, + "num_tokens": 9484153.0, + "reward": 1.984375, + "reward_std": 0.0625, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 238, + "token_counts/after_target": 1163.5, + "token_counts/after_think": 534.5, + "token_counts/before_target": 1723.5, + "token_counts/before_think": 2202.0 + }, + { + "avg_penalty/after_target": 
2.291272819042206, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4300369396805763, + "avg_penalty/before_think": 0.5747007876634598, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.5, + "completions/max_terminated_length": 646.5, + "completions/mean_length": 326.734375, + "completions/mean_terminated_length": 326.734375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.1195, + "grad_norm": 0.6527411341667175, + "kl": 0.328125, + "learning_rate": 1.9978014682920503e-05, + "loss": 0.0324, + "num_tokens": 9516792.0, + "reward": 1.984375, + "reward_std": 0.0625, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 239, + "token_counts/after_target": 899.75, + "token_counts/after_think": 292.75, + "token_counts/before_target": 1692.25, + "token_counts/before_think": 2343.0 + }, + { + "avg_penalty/after_target": 2.3188424706459045, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.42886439710855484, + "avg_penalty/before_think": 0.638686329126358, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 816.5, + "completions/max_terminated_length": 816.5, + "completions/mean_length": 398.234375, + "completions/mean_terminated_length": 398.234375, + "completions/min_length": 99.75, + "completions/min_terminated_length": 99.75, + "epoch": 0.12, + "grad_norm": 0.17037880420684814, + "kl": 0.3271484375, + "learning_rate": 1.9976842788356054e-05, + "loss": 0.0131, + "num_tokens": 
9552375.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 240, + "token_counts/after_target": 1137.0, + "token_counts/after_think": 431.25, + "token_counts/before_target": 1746.5, + "token_counts/before_think": 3057.0 + }, + { + "avg_penalty/after_target": 2.3626803755760193, + "avg_penalty/after_think": 3.9756441712379456, + "avg_penalty/before_target": 0.4897584393620491, + "avg_penalty/before_think": 0.5930778980255127, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.25, + "completions/max_terminated_length": 744.25, + "completions/mean_length": 340.578125, + "completions/mean_terminated_length": 340.578125, + "completions/min_length": 36.5, + "completions/min_terminated_length": 36.5, + "epoch": 0.1205, + "grad_norm": 1.2201557159423828, + "kl": 0.3671875, + "learning_rate": 1.9975640502598243e-05, + "loss": 0.0493, + "num_tokens": 9584892.0, + "reward": 1.8984375, + "reward_std": 0.29934319853782654, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.23328252136707306, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.06976010836660862, + "step": 241, + "token_counts/after_target": 623.25, + "token_counts/after_think": 474.5, + "token_counts/before_target": 1312.0, + "token_counts/before_think": 3039.5 + }, + { + "avg_penalty/after_target": 1.6894586682319641, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5215241760015488, + "avg_penalty/before_think": 0.7361392378807068, + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 745.25, + "completions/max_terminated_length": 691.75, + "completions/mean_length": 407.328125, + "completions/mean_terminated_length": 399.27396392822266, + "completions/min_length": 90.5, + "completions/min_terminated_length": 90.5, + "epoch": 0.121, + "grad_norm": 0.8723154067993164, + "kl": 0.306640625, + "learning_rate": 1.9974407829309442e-05, + "loss": 0.0816, + "num_tokens": 9622241.0, + "reward": 1.94140625, + "reward_std": 0.19476625323295593, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.14789126068353653, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.046875, + "step": 242, + "token_counts/after_target": 1226.5, + "token_counts/after_think": 526.0, + "token_counts/before_target": 1786.25, + "token_counts/before_think": 2978.5 + }, + { + "avg_penalty/after_target": 2.504943937063217, + "avg_penalty/after_think": 2.9847683906555176, + "avg_penalty/before_target": 0.5119561180472374, + "avg_penalty/before_think": 0.6111245527863503, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.25, + "completions/max_terminated_length": 692.25, + "completions/mean_length": 403.71875, + "completions/mean_terminated_length": 403.71875, + "completions/min_length": 129.75, + "completions/min_terminated_length": 129.75, + "epoch": 0.1215, + "grad_norm": 1.0915601253509521, + "kl": 0.33447265625, + "learning_rate": 1.997314477224458e-05, + "loss": 0.0317, + "num_tokens": 9656735.0, + "reward": 1.4453125, + "reward_std": 0.731791764497757, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.48989029973745346, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.29897016659379005, + "step": 243, + "token_counts/after_target": 1649.25, + "token_counts/after_think": 154.25, + "token_counts/before_target": 2754.5, + "token_counts/before_think": 1901.5 + }, + { + "avg_penalty/after_target": 2.3226024210453033, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.6050432473421097, + "avg_penalty/before_think": 0.6262252628803253, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 928.5, + "completions/max_terminated_length": 807.25, + "completions/mean_length": 468.328125, + "completions/mean_terminated_length": 425.00164794921875, + "completions/min_length": 167.75, + "completions/min_terminated_length": 167.75, + "epoch": 0.122, + "grad_norm": 7.088492393493652, + "kl": 0.923828125, + "learning_rate": 1.9971851335251162e-05, + "loss": 0.3015, + "num_tokens": 9696740.0, + "reward": 1.4140625, + "reward_std": 0.7836328595876694, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.500852182507515, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3347511813044548, + "step": 244, + "token_counts/after_target": 2076.5, + "token_counts/after_think": 350.25, + "token_counts/before_target": 2986.0, + "token_counts/before_think": 2080.5 + }, + { + "avg_penalty/after_target": 2.4575947523117065, + "avg_penalty/after_think": 2.9239264726638794, + "avg_penalty/before_target": 0.9319668859243393, + "avg_penalty/before_think": 0.9952702224254608, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 623.375, + "completions/mean_terminated_length": 338.42501068115234, + "completions/min_length": 132.5, + "completions/min_terminated_length": 132.5, + "epoch": 0.1225, + "grad_norm": 26.41432762145996, + "kl": 16.859375, + "learning_rate": 1.9970527522269204e-05, + "loss": 1.045, + "num_tokens": 9748540.0, + "reward": 0.890625, + "reward_std": 0.8674248158931732, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.48456869274377823, + "rewards/tag_count_reward/mean": 0.546875, + "rewards/tag_count_reward/std": 0.4513557106256485, + "step": 245, + "token_counts/after_target": 4596.0, + "token_counts/after_think": 80.5, + "token_counts/before_target": 3862.75, + "token_counts/before_think": 1434.75 + }, + { + "avg_penalty/after_target": 2.7529065012931824, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 1.1501492410898209, + "avg_penalty/before_think": 1.0579592660069466, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 281.5, + "completions/mean_length": 852.890625, + "completions/mean_terminated_length": 199.71875, + "completions/min_length": 87.75, + "completions/min_terminated_length": 87.75, + "epoch": 0.123, + "grad_norm": 77.33348083496094, + "kl": 34.03125, + "learning_rate": 1.9969173337331283e-05, + "loss": 1.6081, + "num_tokens": 9814181.0, + "reward": 0.3671875, + "reward_std": 0.5523828640580177, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.24467839300632477, + "rewards/tag_count_reward/mean": 0.2578125, + "rewards/tag_count_reward/std": 0.3467531129717827, + "step": 246, + "token_counts/after_target": 7458.5, + "token_counts/after_think": 86.5, + "token_counts/before_target": 5558.75, + "token_counts/before_think": 542.5 + }, + { + "avg_penalty/after_target": 2.5170300006866455, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 1.2447095215320587, + "avg_penalty/before_think": 1.3384212404489517, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 450.75, + "completions/mean_length": 889.109375, + "completions/mean_terminated_length": 352.0416679382324, + "completions/min_length": 233.25, + "completions/min_terminated_length": 233.25, + "epoch": 0.1235, + "grad_norm": 47.45655822753906, + "kl": 19.78125, + "learning_rate": 1.9967788784562474e-05, + "loss": 0.9425, + "num_tokens": 9880012.0, + "reward": 0.359375, + "reward_std": 0.4654877558350563, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.1632782220840454, + "rewards/tag_count_reward/mean": 0.296875, + "rewards/tag_count_reward/std": 0.33732734620571136, + "step": 247, + "token_counts/after_target": 7511.75, + "token_counts/after_think": 184.25, + "token_counts/before_target": 5682.75, + "token_counts/before_think": 847.0 + }, + { + "avg_penalty/after_target": 2.7064287662506104, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 1.3117656409740448, + "avg_penalty/before_think": 1.1266257017850876, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 861.296875, + "completions/mean_terminated_length": 406.0625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.124, + "grad_norm": 6.86893892288208, + "kl": 3.1484375, + "learning_rate": 1.9966373868180367e-05, + "loss": 0.3472, + "num_tokens": 9945583.0, + "reward": 0.45703125, + "reward_std": 0.6213441342115402, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.31116948276758194, + "rewards/tag_count_reward/mean": 0.34765625, + "rewards/tag_count_reward/std": 0.38069846481084824, + "step": 248, + "token_counts/after_target": 7572.0, + "token_counts/after_think": 143.5, + "token_counts/before_target": 5088.0, + "token_counts/before_think": 977.25 + }, + { + "avg_penalty/after_target": 2.6758301854133606, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 1.2503492534160614, + "avg_penalty/before_think": 1.0966725423932076, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 506.75, + "completions/mean_length": 804.71875, + "completions/mean_terminated_length": 283.1916732788086, + "completions/min_length": 121.25, + "completions/min_terminated_length": 121.25, + "epoch": 0.1245, + "grad_norm": 16.4239501953125, + "kl": 3.65625, + "learning_rate": 1.9964928592495046e-05, + "loss": 0.4884, + "num_tokens": 10005165.0, + "reward": 0.65625, + "reward_std": 0.8689076006412506, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.296875, + 
"rewards/format_reward/std": 0.46034691482782364, + "rewards/tag_count_reward/mean": 0.359375, + "rewards/tag_count_reward/std": 0.4290317967534065, + "step": 249, + "token_counts/after_target": 7140.75, + "token_counts/after_think": 28.25, + "token_counts/before_target": 5133.0, + "token_counts/before_think": 573.5 + }, + { + "avg_penalty/after_target": 2.62486469745636, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 1.258875623345375, + "avg_penalty/before_think": 1.0965843945741653, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 452.5, + "completions/mean_length": 906.453125, + "completions/mean_terminated_length": 358.0208435058594, + "completions/min_length": 439.75, + "completions/min_terminated_length": 183.75, + "epoch": 0.125, + "grad_norm": 36.25382995605469, + "kl": 6.03125, + "learning_rate": 1.9963452961909065e-05, + "loss": 0.7626, + "num_tokens": 10075786.0, + "reward": 0.421875, + "reward_std": 0.5347139909863472, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.22360680997371674, + "rewards/tag_count_reward/mean": 0.296875, + "rewards/tag_count_reward/std": 0.33475418388843536, + "step": 250, + "token_counts/after_target": 7811.5, + "token_counts/after_think": 47.25, + "token_counts/before_target": 5417.5, + "token_counts/before_think": 1227.0 + }, + { + "avg_penalty/after_target": 2.725254714488983, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 1.2240894436836243, + "avg_penalty/before_think": 0.8126622438430786, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 
0.5625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 795.75, + "completions/mean_length": 777.546875, + "completions/mean_terminated_length": 460.6785888671875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.1255, + "grad_norm": 74.1899185180664, + "kl": 9.765625, + "learning_rate": 1.9961946980917457e-05, + "loss": 2.0079, + "num_tokens": 10136141.0, + "reward": 0.6015625, + "reward_std": 0.703405573964119, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.171875, + "rewards/format_reward/std": 0.3758598491549492, + "rewards/tag_count_reward/mean": 0.4296875, + "rewards/tag_count_reward/std": 0.39428601413965225, + "step": 251, + "token_counts/after_target": 6374.75, + "token_counts/after_think": 0.0, + "token_counts/before_target": 3901.75, + "token_counts/before_think": 2164.25 + }, + { + "avg_penalty/after_target": 2.654365658760071, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 1.2101339995861053, + "avg_penalty/before_think": 1.2124089896678925, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 523.5, + "completions/mean_length": 854.90625, + "completions/mean_terminated_length": 319.7291679382324, + "completions/min_length": 125.75, + "completions/min_terminated_length": 125.75, + "epoch": 0.126, + "grad_norm": 36.92155075073242, + "kl": 19.75, + "learning_rate": 1.99604106541077e-05, + "loss": 2.4124, + "num_tokens": 10201303.0, + "reward": 0.3828125, + "reward_std": 0.6358186900615692, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.31116948276758194, + "rewards/tag_count_reward/mean": 
0.2734375, + "rewards/tag_count_reward/std": 0.37672392278909683, + "step": 252, + "token_counts/after_target": 7310.75, + "token_counts/after_think": 19.75, + "token_counts/before_target": 5483.0, + "token_counts/before_think": 865.0 + }, + { + "avg_penalty/after_target": 2.6300604343414307, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 1.2320248782634735, + "avg_penalty/before_think": 0.9449933022260666, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 427.5, + "completions/mean_length": 855.96875, + "completions/mean_terminated_length": 322.13750076293945, + "completions/min_length": 222.75, + "completions/min_terminated_length": 222.75, + "epoch": 0.1265, + "grad_norm": 61.66041564941406, + "kl": 42.5, + "learning_rate": 1.9958843986159705e-05, + "loss": 3.2896, + "num_tokens": 10267157.0, + "reward": 0.31640625, + "reward_std": 0.5291812717914581, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.27289126068353653, + "rewards/tag_count_reward/mean": 0.23828125, + "rewards/tag_count_reward/std": 0.3060266673564911, + "step": 253, + "token_counts/after_target": 7426.75, + "token_counts/after_think": 125.25, + "token_counts/before_target": 5579.5, + "token_counts/before_think": 564.0 + }, + { + "avg_penalty/after_target": 2.619847893714905, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 1.2545706927776337, + "avg_penalty/before_think": 0.9973493665456772, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 1024.0, + 
"completions/max_terminated_length": 308.0, + "completions/mean_length": 934.265625, + "completions/mean_terminated_length": 231.0208339691162, + "completions/min_length": 397.25, + "completions/min_terminated_length": 141.25, + "epoch": 0.127, + "grad_norm": 82.0105972290039, + "kl": 52.8125, + "learning_rate": 1.9957246981845825e-05, + "loss": 3.5804, + "num_tokens": 10339030.0, + "reward": 0.21875, + "reward_std": 0.4220023788511753, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.17078252136707306, + "rewards/tag_count_reward/mean": 0.15625, + "rewards/tag_count_reward/std": 0.2721790224313736, + "step": 254, + "token_counts/after_target": 8267.5, + "token_counts/after_think": 88.25, + "token_counts/before_target": 6159.25, + "token_counts/before_think": 433.25 + }, + { + "avg_penalty/after_target": 2.711149215698242, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 1.1932114660739899, + "avg_penalty/before_think": 1.294352874159813, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 461.75, + "completions/mean_length": 824.34375, + "completions/mean_terminated_length": 235.03810119628906, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.1275, + "grad_norm": 7.7046098709106445, + "kl": 34.6875, + "learning_rate": 1.99556196460308e-05, + "loss": 3.0074, + "num_tokens": 10406892.0, + "reward": 0.3046875, + "reward_std": 0.4829505681991577, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.17078252136707306, + "rewards/tag_count_reward/mean": 0.2421875, + "rewards/tag_count_reward/std": 
0.3313433527946472, + "step": 255, + "token_counts/after_target": 7304.0, + "token_counts/after_think": 3.25, + "token_counts/before_target": 5254.5, + "token_counts/before_think": 627.75 + }, + { + "avg_penalty/after_target": 2.686732828617096, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 1.3342198431491852, + "avg_penalty/before_think": 1.3856259286403656, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 162.5, + "completions/mean_length": 887.078125, + "completions/mean_terminated_length": 152.45000076293945, + "completions/min_length": 140.25, + "completions/min_terminated_length": 140.25, + "epoch": 0.128, + "grad_norm": 26.911882400512695, + "kl": 25.0, + "learning_rate": 1.9953961983671792e-05, + "loss": 2.6153, + "num_tokens": 10478913.0, + "reward": 0.25390625, + "reward_std": 0.41723354905843735, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.22265625, + "rewards/tag_count_reward/std": 0.32459961995482445, + "step": 256, + "token_counts/after_target": 7671.75, + "token_counts/after_think": 152.0, + "token_counts/before_target": 5168.25, + "token_counts/before_think": 1201.25 + }, + { + "avg_penalty/after_target": 2.8851694464683533, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 1.2791852056980133, + "avg_penalty/before_think": 1.2998596131801605, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 393.75, + "completions/mean_length": 741.703125, + 
"completions/mean_terminated_length": 233.57777786254883, + "completions/min_length": 92.75, + "completions/min_terminated_length": 92.75, + "epoch": 0.1285, + "grad_norm": 16.584562301635742, + "kl": 33.28125, + "learning_rate": 1.9952273999818312e-05, + "loss": 3.1596, + "num_tokens": 10536750.0, + "reward": 0.328125, + "reward_std": 0.44992633908987045, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.1875, + "rewards/tag_count_reward/mean": 0.28125, + "rewards/tag_count_reward/std": 0.3171077221632004, + "step": 257, + "token_counts/after_target": 6945.0, + "token_counts/after_think": 11.0, + "token_counts/before_target": 3880.0, + "token_counts/before_think": 1031.25 + }, + { + "avg_penalty/after_target": 2.724967062473297, + "avg_penalty/after_think": 2.617360472679138, + "avg_penalty/before_target": 1.081670567393303, + "avg_penalty/before_think": 0.7065747678279877, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 423.75, + "completions/mean_length": 549.234375, + "completions/mean_terminated_length": 174.89643096923828, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.129, + "grad_norm": 58.054588317871094, + "kl": 50.875, + "learning_rate": 1.9950555699612265e-05, + "loss": 3.6254, + "num_tokens": 10580205.0, + "reward": 0.41015625, + "reward_std": 0.6283693760633469, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.3186737820506096, + "rewards/tag_count_reward/mean": 0.30078125, + "rewards/tag_count_reward/std": 0.3675200790166855, + "step": 258, + "token_counts/after_target": 4270.75, + 
"token_counts/after_think": 70.25, + "token_counts/before_target": 3607.25, + "token_counts/before_think": 839.5 + }, + { + "avg_penalty/after_target": 2.876316726207733, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 1.0988241136074066, + "avg_penalty/before_think": 1.0130911618471146, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 493.25, + "completions/mean_length": 598.578125, + "completions/mean_terminated_length": 210.18561553955078, + "completions/min_length": 37.75, + "completions/min_terminated_length": 37.75, + "epoch": 0.1295, + "grad_norm": 58.20395278930664, + "kl": 47.375, + "learning_rate": 1.9948807088287884e-05, + "loss": 3.5814, + "num_tokens": 10627602.0, + "reward": 0.43359375, + "reward_std": 0.5601033344864845, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.2596946656703949, + "rewards/tag_count_reward/mean": 0.32421875, + "rewards/tag_count_reward/std": 0.3480750247836113, + "step": 259, + "token_counts/after_target": 5122.0, + "token_counts/after_think": 53.5, + "token_counts/before_target": 3466.75, + "token_counts/before_think": 935.0 + }, + { + "avg_penalty/after_target": 2.8190863728523254, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 1.3089715242385864, + "avg_penalty/before_think": 0.32022634893655777, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 491.5, + "completions/mean_length": 726.921875, + "completions/mean_terminated_length": 235.20138931274414, + 
"completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.13, + "grad_norm": 40.18400955200195, + "kl": 42.3125, + "learning_rate": 1.9947028171171742e-05, + "loss": 3.4145, + "num_tokens": 10681597.0, + "reward": 0.45703125, + "reward_std": 0.6977882385253906, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.36483466625213623, + "rewards/tag_count_reward/mean": 0.30078125, + "rewards/tag_count_reward/std": 0.36734161525964737, + "step": 260, + "token_counts/after_target": 6334.75, + "token_counts/after_think": 14.75, + "token_counts/before_target": 4482.5, + "token_counts/before_think": 798.75 + }, + { + "avg_penalty/after_target": 2.6476643085479736, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 1.3236280679702759, + "avg_penalty/before_think": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1305, + "grad_norm": 20.87360954284668, + "kl": 27.90625, + "learning_rate": 1.9945218953682736e-05, + "loss": 2.5809, + "num_tokens": 10757837.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 261, + "token_counts/after_target": 9503.0, + "token_counts/after_think": 0.0, + "token_counts/before_target": 6881.0, + "token_counts/before_think": 0.0 + }, + { + "avg_penalty/after_target": 2.655242443084717, + 
"avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 1.3239622712135315, + "avg_penalty/before_think": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1.75, + "completions/mean_length": 992.171875, + "completions/mean_terminated_length": 1.375, + "completions/min_length": 769.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.131, + "grad_norm": 27.475799560546875, + "kl": 24.1875, + "learning_rate": 1.994337944133205e-05, + "loss": 2.4371, + "num_tokens": 10832024.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "step": 262, + "token_counts/after_target": 9250.75, + "token_counts/after_think": 0.0, + "token_counts/before_target": 6624.0, + "token_counts/before_think": 0.0 + }, + { + "avg_penalty/after_target": 2.7437276244163513, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 1.22971111536026, + "avg_penalty/before_think": 1.1115635931491852, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 881.421875, + "completions/mean_terminated_length": 158.64999961853027, + "completions/min_length": 327.75, + "completions/min_terminated_length": 71.75, + "epoch": 0.1315, + "grad_norm": 41.3861198425293, + "kl": 24.53125, + "learning_rate": 1.9941509639723155e-05, + "loss": 2.6891, + "num_tokens": 10899267.0, + "reward": 0.1953125, + "reward_std": 
0.37760260701179504, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.1640625, + "rewards/tag_count_reward/std": 0.28207893669605255, + "step": 263, + "token_counts/after_target": 7878.5, + "token_counts/after_think": 0.0, + "token_counts/before_target": 5658.5, + "token_counts/before_think": 565.75 + }, + { + "avg_penalty/after_target": 2.637008547782898, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 1.2794127464294434, + "avg_penalty/before_think": 0.8136550933122635, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 883.09375, + "completions/mean_terminated_length": 191.2750015258789, + "completions/min_length": 95.25, + "completions/min_terminated_length": 95.25, + "epoch": 0.132, + "grad_norm": 22.258848190307617, + "kl": 43.75, + "learning_rate": 1.99396095545518e-05, + "loss": 3.4217, + "num_tokens": 10965961.0, + "reward": 0.203125, + "reward_std": 0.4478817656636238, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.1875, + "rewards/tag_count_reward/mean": 0.15625, + "rewards/tag_count_reward/std": 0.2839956134557724, + "step": 264, + "token_counts/after_target": 8036.75, + "token_counts/after_think": 0.0, + "token_counts/before_target": 5679.25, + "token_counts/before_think": 413.5 + }, + { + "avg_penalty/after_target": 2.752480208873749, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 1.275103509426117, + "avg_penalty/before_think": 0.9951149374246597, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 343.5, + "completions/mean_length": 752.703125, + "completions/mean_terminated_length": 200.86111068725586, + "completions/min_length": 65.5, + "completions/min_terminated_length": 65.5, + "epoch": 0.1325, + "grad_norm": 14.239786148071289, + "kl": 43.3125, + "learning_rate": 1.9937679191605964e-05, + "loss": 3.4942, + "num_tokens": 11022966.0, + "reward": 0.40625, + "reward_std": 0.6419344842433929, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.33406074345111847, + "rewards/tag_count_reward/mean": 0.28125, + "rewards/tag_count_reward/std": 0.3660692498087883, + "step": 265, + "token_counts/after_target": 6697.0, + "token_counts/after_think": 18.25, + "token_counts/before_target": 4864.5, + "token_counts/before_think": 463.5 + }, + { + "avg_penalty/after_target": 2.4207857847213745, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.8369730859994888, + "avg_penalty/before_think": 0.6062517464160919, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 498.84375, + "completions/mean_terminated_length": 355.08349609375, + "completions/min_length": 171.75, + "completions/min_terminated_length": 171.75, + "epoch": 0.133, + "grad_norm": 21.91700553894043, + "kl": 19.390625, + "learning_rate": 1.9935718556765878e-05, + "loss": 2.0085, + "num_tokens": 11066652.0, + "reward": 1.16796875, + "reward_std": 0.8051015585660934, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.515625, + "rewards/format_reward/std": 0.500852182507515, + "rewards/tag_count_reward/mean": 0.65234375, + "rewards/tag_count_reward/std": 0.37499523907899857, + "step": 266, + "token_counts/after_target": 2815.0, + "token_counts/after_think": 226.25, + "token_counts/before_target": 3424.5, + "token_counts/before_think": 1515.75 + }, + { + "avg_penalty/after_target": 2.4714253544807434, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.6845817565917969, + "avg_penalty/before_think": 0.6741690337657928, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 894.5, + "completions/max_terminated_length": 611.75, + "completions/mean_length": 436.578125, + "completions/mean_terminated_length": 340.9130096435547, + "completions/min_length": 56.25, + "completions/min_terminated_length": 56.25, + "epoch": 0.1335, + "grad_norm": 13.463183403015137, + "kl": 19.265625, + "learning_rate": 1.9933727656003964e-05, + "loss": 1.7869, + "num_tokens": 11105809.0, + "reward": 1.0078125, + "reward_std": 0.7320293486118317, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.359375, + "rewards/format_reward/std": 0.4550696536898613, + "rewards/tag_count_reward/mean": 0.6484375, + "rewards/tag_count_reward/std": 0.3654973953962326, + "step": 267, + "token_counts/after_target": 2265.5, + "token_counts/after_think": 209.0, + "token_counts/before_target": 3102.75, + "token_counts/before_think": 1408.0 + }, + { + "avg_penalty/after_target": 2.53296235203743, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.40023260563611984, + "avg_penalty/before_think": 0.7780047357082367, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 942.75, + "completions/max_terminated_length": 679.75, + "completions/mean_length": 403.890625, + "completions/mean_terminated_length": 371.7291793823242, + "completions/min_length": 72.25, + "completions/min_terminated_length": 72.25, + "epoch": 0.134, + "grad_norm": 13.544079780578613, + "kl": 13.55859375, + "learning_rate": 1.9931706495384865e-05, + "loss": 1.2606, + "num_tokens": 11140602.0, + "reward": 1.15625, + "reward_std": 0.8008064031600952, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.4375, + "rewards/format_reward/std": 0.50393907725811, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.3645177371799946, + "step": 268, + "token_counts/after_target": 1671.75, + "token_counts/after_think": 96.0, + "token_counts/before_target": 2728.5, + "token_counts/before_think": 1966.0 + }, + { + "avg_penalty/after_target": 2.6336841583251953, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.453223392367363, + "avg_penalty/before_think": 0.538188248872757, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 735.0, + "completions/max_terminated_length": 485.25, + "completions/mean_length": 270.921875, + "completions/mean_terminated_length": 246.49063110351562, + "completions/min_length": 83.5, + "completions/min_terminated_length": 83.5, + "epoch": 0.1345, + "grad_norm": 16.85172462463379, + "kl": 19.54296875, + "learning_rate": 1.992965508106537e-05, + "loss": 1.5511, + "num_tokens": 11167429.0, + "reward": 1.38671875, + "reward_std": 0.7612245380878448, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + 
"rewards/format_reward/std": 0.5071863383054733, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.31484606117010117, + "step": 269, + "token_counts/after_target": 990.0, + "token_counts/after_think": 57.25, + "token_counts/before_target": 2077.0, + "token_counts/before_think": 1210.5 + }, + { + "avg_penalty/after_target": 2.621288388967514, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.45011113956570625, + "avg_penalty/before_think": 0.5433842837810516, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 802.25, + "completions/max_terminated_length": 604.25, + "completions/mean_length": 262.40625, + "completions/mean_terminated_length": 211.46081924438477, + "completions/min_length": 46.75, + "completions/min_terminated_length": 46.75, + "epoch": 0.135, + "grad_norm": 48.16564178466797, + "kl": 37.21875, + "learning_rate": 1.9927573419294456e-05, + "loss": 2.4467, + "num_tokens": 11193423.0, + "reward": 0.96875, + "reward_std": 0.8226608783006668, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.359375, + "rewards/format_reward/std": 0.49244368076324463, + "rewards/tag_count_reward/mean": 0.609375, + "rewards/tag_count_reward/std": 0.3879391700029373, + "step": 270, + "token_counts/after_target": 1202.25, + "token_counts/after_think": 89.25, + "token_counts/before_target": 2030.0, + "token_counts/before_think": 877.0 + }, + { + "avg_penalty/after_target": 2.3164624869823456, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.6300476044416428, + "avg_penalty/before_think": 0.761708214879036, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.109375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 539.5, + "completions/mean_length": 334.390625, + "completions/mean_terminated_length": 250.45285034179688, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.1355, + "grad_norm": 30.06278419494629, + "kl": 33.40625, + "learning_rate": 1.9925461516413224e-05, + "loss": 2.4495, + "num_tokens": 11223384.0, + "reward": 1.109375, + "reward_std": 0.8464330285787582, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.48989029973745346, + "rewards/tag_count_reward/mean": 0.640625, + "rewards/tag_count_reward/std": 0.3881410285830498, + "step": 271, + "token_counts/after_target": 1549.25, + "token_counts/after_think": 170.75, + "token_counts/before_target": 2631.5, + "token_counts/before_think": 998.75 + }, + { + "avg_penalty/after_target": 2.2395834624767303, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.6318114846944809, + "avg_penalty/before_think": 0.8294173777103424, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 725.5, + "completions/mean_length": 375.421875, + "completions/mean_terminated_length": 269.7124710083008, + "completions/min_length": 71.5, + "completions/min_terminated_length": 71.5, + "epoch": 0.136, + "grad_norm": 9.959059715270996, + "kl": 25.1875, + "learning_rate": 1.9923319378854888e-05, + "loss": 2.1108, + "num_tokens": 11257955.0, + "reward": 0.91015625, + "reward_std": 0.8302068263292313, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.328125, + "rewards/format_reward/std": 
0.479247085750103, + "rewards/tag_count_reward/mean": 0.58203125, + "rewards/tag_count_reward/std": 0.4200195223093033, + "step": 272, + "token_counts/after_target": 1904.0, + "token_counts/after_think": 108.0, + "token_counts/before_target": 3229.75, + "token_counts/before_think": 765.0 + }, + { + "avg_penalty/after_target": 2.086013972759247, + "avg_penalty/after_think": 3.2165152728557587, + "avg_penalty/before_target": 0.6989402770996094, + "avg_penalty/before_think": 0.9616588354110718, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 1007.75, + "completions/max_terminated_length": 957.5, + "completions/mean_length": 469.390625, + "completions/mean_terminated_length": 374.9233169555664, + "completions/min_length": 103.75, + "completions/min_terminated_length": 103.75, + "epoch": 0.1365, + "grad_norm": 30.359189987182617, + "kl": 13.5390625, + "learning_rate": 1.9921147013144782e-05, + "loss": 1.7289, + "num_tokens": 11298044.0, + "reward": 1.1015625, + "reward_std": 0.8366825133562088, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.500852182507515, + "rewards/tag_count_reward/mean": 0.6484375, + "rewards/tag_count_reward/std": 0.3809707835316658, + "step": 273, + "token_counts/after_target": 2703.75, + "token_counts/after_think": 106.5, + "token_counts/before_target": 3145.0, + "token_counts/before_think": 1555.0 + }, + { + "avg_penalty/after_target": 2.6195348501205444, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.9393737018108368, + "avg_penalty/before_think": 0.6834639012813568, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 
0.34375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 759.5, + "completions/mean_length": 597.390625, + "completions/mean_terminated_length": 377.125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.137, + "grad_norm": 51.010536193847656, + "kl": 17.359375, + "learning_rate": 1.99189444259003e-05, + "loss": 2.2215, + "num_tokens": 11346389.0, + "reward": 0.875, + "reward_std": 0.9050257056951523, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.359375, + "rewards/format_reward/std": 0.48989029973745346, + "rewards/tag_count_reward/mean": 0.515625, + "rewards/tag_count_reward/std": 0.46375224739313126, + "step": 274, + "token_counts/after_target": 4084.75, + "token_counts/after_think": 59.0, + "token_counts/before_target": 4226.0, + "token_counts/before_think": 1188.5 + }, + { + "avg_penalty/after_target": 2.5461562871932983, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.9889370650053024, + "avg_penalty/before_think": 1.3098607808351517, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 777.25, + "completions/mean_length": 684.0, + "completions/mean_terminated_length": 424.9404830932617, + "completions/min_length": 77.5, + "completions/min_terminated_length": 77.5, + "epoch": 0.1375, + "grad_norm": 12.251191139221191, + "kl": 27.90625, + "learning_rate": 1.9916711623830904e-05, + "loss": 2.4687, + "num_tokens": 11401029.0, + "reward": 0.59765625, + "reward_std": 0.7040480002760887, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.21875, + "rewards/format_reward/std": 0.3155868947505951, + "rewards/tag_count_reward/mean": 
0.36328125, + "rewards/tag_count_reward/std": 0.40022093057632446, + "step": 275, + "token_counts/after_target": 5177.75, + "token_counts/after_think": 128.5, + "token_counts/before_target": 4530.25, + "token_counts/before_think": 1107.5 + }, + { + "avg_penalty/after_target": 2.5602476596832275, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 1.033720001578331, + "avg_penalty/before_think": 0.9322051256895065, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.453125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 852.75, + "completions/mean_length": 698.921875, + "completions/mean_terminated_length": 435.67708587646484, + "completions/min_length": 125.75, + "completions/min_terminated_length": 125.75, + "epoch": 0.138, + "grad_norm": 36.47609329223633, + "kl": 45.1875, + "learning_rate": 1.9914448613738107e-05, + "loss": 3.2569, + "num_tokens": 11453440.0, + "reward": 0.48046875, + "reward_std": 0.7462364733219147, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.171875, + "rewards/format_reward/std": 0.37149807065725327, + "rewards/tag_count_reward/mean": 0.30859375, + "rewards/tag_count_reward/std": 0.43049704283475876, + "step": 276, + "token_counts/after_target": 5312.5, + "token_counts/after_think": 52.5, + "token_counts/before_target": 4856.5, + "token_counts/before_think": 961.25 + }, + { + "avg_penalty/after_target": 2.443041145801544, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.9444619417190552, + "avg_penalty/before_think": 1.3433901071548462, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.484375, + "completions/max_length": 1024.0, + 
"completions/max_terminated_length": 800.75, + "completions/mean_length": 704.765625, + "completions/mean_terminated_length": 402.9548645019531, + "completions/min_length": 60.25, + "completions/min_terminated_length": 60.25, + "epoch": 0.1385, + "grad_norm": 49.4239501953125, + "kl": 45.3125, + "learning_rate": 1.991215540251542e-05, + "loss": 3.0987, + "num_tokens": 11508689.0, + "reward": 0.484375, + "reward_std": 0.7047210037708282, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.3529609143733978, + "rewards/tag_count_reward/mean": 0.328125, + "rewards/tag_count_reward/std": 0.41176242381334305, + "step": 277, + "token_counts/after_target": 5167.0, + "token_counts/after_think": 99.5, + "token_counts/before_target": 4775.5, + "token_counts/before_think": 1234.25 + }, + { + "avg_penalty/after_target": 2.2536297738552094, + "avg_penalty/after_think": 3.9903661012649536, + "avg_penalty/before_target": 0.8483614027500153, + "avg_penalty/before_think": 0.8970254361629486, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 1004.75, + "completions/max_terminated_length": 906.25, + "completions/mean_length": 576.65625, + "completions/mean_terminated_length": 493.202392578125, + "completions/min_length": 23.25, + "completions/min_terminated_length": 23.25, + "epoch": 0.139, + "grad_norm": 14.666707038879395, + "kl": 21.328125, + "learning_rate": 1.9909831997148363e-05, + "loss": 1.735, + "num_tokens": 11554555.0, + "reward": 0.83203125, + "reward_std": 0.8430986404418945, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.46566852182149887, + "rewards/tag_count_reward/mean": 0.51953125, + 
"rewards/tag_count_reward/std": 0.4411497935652733, + "step": 278, + "token_counts/after_target": 3120.5, + "token_counts/after_think": 389.0, + "token_counts/before_target": 3474.25, + "token_counts/before_think": 2242.75 + }, + { + "avg_penalty/after_target": 2.6151167154312134, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.9654459804296494, + "avg_penalty/before_think": 0.39242975786328316, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 565.75, + "completions/mean_length": 614.828125, + "completions/mean_terminated_length": 253.3545150756836, + "completions/min_length": 44.25, + "completions/min_terminated_length": 44.25, + "epoch": 0.1395, + "grad_norm": 24.028892517089844, + "kl": 39.75, + "learning_rate": 1.9907478404714438e-05, + "loss": 3.059, + "num_tokens": 11605008.0, + "reward": 0.30078125, + "reward_std": 0.49440594762563705, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.2257782220840454, + "rewards/tag_count_reward/mean": 0.22265625, + "rewards/tag_count_reward/std": 0.3055712655186653, + "step": 279, + "token_counts/after_target": 4657.75, + "token_counts/after_think": 1.5, + "token_counts/before_target": 4738.75, + "token_counts/before_think": 439.25 + }, + { + "avg_penalty/after_target": 2.5867176949977875, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 1.182446762919426, + "avg_penalty/before_think": 0.738055557012558, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 781.5, 
+ "completions/mean_length": 589.609375, + "completions/mean_terminated_length": 347.56951904296875, + "completions/min_length": 56.5, + "completions/min_terminated_length": 56.5, + "epoch": 0.14, + "grad_norm": 31.87108612060547, + "kl": 22.8125, + "learning_rate": 1.990509463238309e-05, + "loss": 2.4427, + "num_tokens": 11652471.0, + "reward": 0.58984375, + "reward_std": 0.7133921831846237, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.1875, + "rewards/format_reward/std": 0.36180340498685837, + "rewards/tag_count_reward/mean": 0.40234375, + "rewards/tag_count_reward/std": 0.408612422645092, + "step": 280, + "token_counts/after_target": 4376.0, + "token_counts/after_think": 130.5, + "token_counts/before_target": 3651.75, + "token_counts/before_think": 1275.5 + }, + { + "avg_penalty/after_target": 2.716567039489746, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.8976949155330658, + "avg_penalty/before_think": 0.9138593673706055, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 784.75, + "completions/mean_length": 662.296875, + "completions/mean_terminated_length": 444.0548858642578, + "completions/min_length": 86.5, + "completions/min_terminated_length": 86.5, + "epoch": 0.1405, + "grad_norm": 22.26685333251953, + "kl": 20.9375, + "learning_rate": 1.9902680687415704e-05, + "loss": 2.1159, + "num_tokens": 11706426.0, + "reward": 0.5234375, + "reward_std": 0.6029567196965218, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.28694770485162735, + "rewards/tag_count_reward/mean": 0.3984375, + "rewards/tag_count_reward/std": 0.3659999892115593, + "step": 281, + 
"token_counts/after_target": 4421.25, + "token_counts/after_think": 100.75, + "token_counts/before_target": 3588.5, + "token_counts/before_think": 2486.25 + }, + { + "avg_penalty/after_target": 2.5031656622886658, + "avg_penalty/after_think": 1.3183899521827698, + "avg_penalty/before_target": 1.1832353174686432, + "avg_penalty/before_think": 0.7513403445482254, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 565.5, + "completions/mean_length": 687.84375, + "completions/mean_terminated_length": 283.8541717529297, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.141, + "grad_norm": 6.1981072425842285, + "kl": 32.75, + "learning_rate": 1.990023657716558e-05, + "loss": 2.8572, + "num_tokens": 11761120.0, + "reward": 0.31640625, + "reward_std": 0.4224400632083416, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.11180340498685837, + "rewards/tag_count_reward/mean": 0.25390625, + "rewards/tag_count_reward/std": 0.31521546468138695, + "step": 282, + "token_counts/after_target": 5501.0, + "token_counts/after_think": 77.0, + "token_counts/before_target": 4730.0, + "token_counts/before_think": 697.5 + }, + { + "avg_penalty/after_target": 2.415256917476654, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.8412190899252892, + "avg_penalty/before_think": 0.6820825934410095, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 968.0, + "completions/max_terminated_length": 623.5, + "completions/mean_length": 482.890625, + 
"completions/mean_terminated_length": 292.0364646911621, + "completions/min_length": 21.5, + "completions/min_terminated_length": 21.5, + "epoch": 0.1415, + "grad_norm": 11.25872802734375, + "kl": 29.5, + "learning_rate": 1.989776230907789e-05, + "loss": 2.3988, + "num_tokens": 11803641.0, + "reward": 0.75, + "reward_std": 0.806875005364418, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.44495995342731476, + "rewards/tag_count_reward/mean": 0.46875, + "rewards/tag_count_reward/std": 0.4057661220431328, + "step": 283, + "token_counts/after_target": 2961.25, + "token_counts/after_think": 219.5, + "token_counts/before_target": 2804.5, + "token_counts/before_think": 1741.0 + }, + { + "avg_penalty/after_target": 2.4839335083961487, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 1.0899880081415176, + "avg_penalty/before_think": 0.9132130295038223, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 855.25, + "completions/mean_length": 632.21875, + "completions/mean_terminated_length": 454.1363754272461, + "completions/min_length": 103.5, + "completions/min_terminated_length": 103.5, + "epoch": 0.142, + "grad_norm": 10.29544448852539, + "kl": 29.59375, + "learning_rate": 1.9895257890689698e-05, + "loss": 2.4341, + "num_tokens": 11854887.0, + "reward": 0.63671875, + "reward_std": 0.7250611484050751, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.203125, + "rewards/format_reward/std": 0.39476002007722855, + "rewards/tag_count_reward/mean": 0.43359375, + "rewards/tag_count_reward/std": 0.39246875047683716, + "step": 284, + "token_counts/after_target": 4169.75, + 
"token_counts/after_think": 443.0, + "token_counts/before_target": 3182.75, + "token_counts/before_think": 2320.0 + }, + { + "avg_penalty/after_target": 2.82723069190979, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5300273671746254, + "avg_penalty/before_think": 0.6986219212412834, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 797.75, + "completions/mean_length": 372.59375, + "completions/mean_terminated_length": 279.8077087402344, + "completions/min_length": 13.75, + "completions/min_terminated_length": 13.75, + "epoch": 0.1425, + "grad_norm": 21.641178131103516, + "kl": 33.21875, + "learning_rate": 1.9892723329629885e-05, + "loss": 2.4588, + "num_tokens": 11891981.0, + "reward": 0.63671875, + "reward_std": 0.7294263541698456, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.1875, + "rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 0.44921875, + "rewards/tag_count_reward/std": 0.3903221860527992, + "step": 285, + "token_counts/after_target": 1956.0, + "token_counts/after_think": 156.75, + "token_counts/before_target": 2979.25, + "token_counts/before_think": 869.5 + }, + { + "avg_penalty/after_target": 2.058988869190216, + "avg_penalty/after_think": 3.4795645475387573, + "avg_penalty/before_target": 0.44646981358528137, + "avg_penalty/before_think": 0.7052328139543533, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 931.5, + "completions/max_terminated_length": 699.25, + "completions/mean_length": 318.25, + "completions/mean_terminated_length": 271.81607818603516, 
+ "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.143, + "grad_norm": 6.665306568145752, + "kl": 19.03125, + "learning_rate": 1.989015863361917e-05, + "loss": 1.5781, + "num_tokens": 11921133.0, + "reward": 0.83984375, + "reward_std": 0.7432484552264214, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.265625, + "rewards/format_reward/std": 0.42430340498685837, + "rewards/tag_count_reward/mean": 0.57421875, + "rewards/tag_count_reward/std": 0.3877384662628174, + "step": 286, + "token_counts/after_target": 1209.0, + "token_counts/after_think": 97.25, + "token_counts/before_target": 2834.0, + "token_counts/before_think": 951.75 + }, + { + "avg_penalty/after_target": 2.1587631702423096, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.48445483297109604, + "avg_penalty/before_think": 0.5606259554624557, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 796.0, + "completions/max_terminated_length": 760.75, + "completions/mean_length": 331.1875, + "completions/mean_terminated_length": 309.8526840209961, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.1435, + "grad_norm": 18.964948654174805, + "kl": 7.0859375, + "learning_rate": 1.988756381047006e-05, + "loss": 0.9573, + "num_tokens": 11950377.0, + "reward": 1.1796875, + "reward_std": 0.8165727853775024, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.484375, + "rewards/format_reward/std": 0.498777836561203, + "rewards/tag_count_reward/mean": 0.6953125, + "rewards/tag_count_reward/std": 0.36101629585027695, + "step": 287, + "token_counts/after_target": 1210.0, + "token_counts/after_think": 128.0, + "token_counts/before_target": 
2471.5, + "token_counts/before_think": 1489.5 + }, + { + "avg_penalty/after_target": 1.9927085638046265, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.49824562668800354, + "avg_penalty/before_think": 0.6754222214221954, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 722.0, + "completions/max_terminated_length": 641.5, + "completions/mean_length": 332.703125, + "completions/mean_terminated_length": 321.9729232788086, + "completions/min_length": 36.5, + "completions/min_terminated_length": 36.5, + "epoch": 0.144, + "grad_norm": 21.432748794555664, + "kl": 5.5, + "learning_rate": 1.9884938868086836e-05, + "loss": 0.8298, + "num_tokens": 11983366.0, + "reward": 1.20703125, + "reward_std": 0.8303260952234268, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.484375, + "rewards/format_reward/std": 0.5112857818603516, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.378871314227581, + "step": 288, + "token_counts/after_target": 1055.25, + "token_counts/after_think": 227.25, + "token_counts/before_target": 2672.25, + "token_counts/before_think": 1368.5 + }, + { + "avg_penalty/after_target": 2.721063554286957, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.6919018253684044, + "avg_penalty/before_think": 0.558097667992115, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 903.75, + "completions/max_terminated_length": 687.25, + "completions/mean_length": 318.0625, + "completions/mean_terminated_length": 219.4980926513672, + "completions/min_length": 16.25, + "completions/min_terminated_length": 16.25, + 
"epoch": 0.1445, + "grad_norm": 23.175342559814453, + "kl": 14.4296875, + "learning_rate": 1.988228381446553e-05, + "loss": 1.5586, + "num_tokens": 12014410.0, + "reward": 0.66015625, + "reward_std": 0.679003544151783, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.234375, + "rewards/format_reward/std": 0.32528156042099, + "rewards/tag_count_reward/mean": 0.42578125, + "rewards/tag_count_reward/std": 0.396708145737648, + "step": 289, + "token_counts/after_target": 1896.75, + "token_counts/after_think": 52.0, + "token_counts/before_target": 2412.25, + "token_counts/before_think": 728.0 + }, + { + "avg_penalty/after_target": 2.4905713349580765, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.6084900796413422, + "avg_penalty/before_think": 0.48501240089535713, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 274.875, + "completions/mean_terminated_length": 181.73490142822266, + "completions/min_length": 6.25, + "completions/min_terminated_length": 6.25, + "epoch": 0.145, + "grad_norm": 17.73133659362793, + "kl": 28.125, + "learning_rate": 1.9879598657693894e-05, + "loss": 2.1053, + "num_tokens": 12041474.0, + "reward": 0.3671875, + "reward_std": 0.5940818376839161, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.2750816270709038, + "rewards/tag_count_reward/mean": 0.2421875, + "rewards/tag_count_reward/std": 0.3429615683853626, + "step": 290, + "token_counts/after_target": 1306.0, + "token_counts/after_think": 51.75, + "token_counts/before_target": 2702.5, + "token_counts/before_think": 337.75 + }, + { + "avg_penalty/after_target": 
2.014268606901169, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.6470592990517616, + "avg_penalty/before_think": 0.21106106042861938, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 851.5, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 257.75, + "completions/mean_terminated_length": 134.52497673034668, + "completions/min_length": 10.75, + "completions/min_terminated_length": 10.75, + "epoch": 0.1455, + "grad_norm": 43.309635162353516, + "kl": 34.6875, + "learning_rate": 1.9876883405951378e-05, + "loss": 2.0219, + "num_tokens": 12068434.0, + "reward": 0.3203125, + "reward_std": 0.5336585342884064, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.2675696536898613, + "rewards/tag_count_reward/mean": 0.1953125, + "rewards/tag_count_reward/std": 0.2811735272407532, + "step": 291, + "token_counts/after_target": 1480.25, + "token_counts/after_think": 13.0, + "token_counts/before_target": 2418.25, + "token_counts/before_think": 212.5 + }, + { + "avg_penalty/after_target": 2.2979260981082916, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.4929722435772419, + "avg_penalty/before_think": 0.8604393526911736, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 885.75, + "completions/max_terminated_length": 625.75, + "completions/mean_length": 283.828125, + "completions/mean_terminated_length": 190.82926559448242, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.146, + "grad_norm": 22.871234893798828, + "kl": 25.5390625, + 
"learning_rate": 1.9874138067509116e-05, + "loss": 1.6404, + "num_tokens": 12096567.0, + "reward": 0.5546875, + "reward_std": 0.7644912004470825, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.21875, + "rewards/format_reward/std": 0.3925696536898613, + "rewards/tag_count_reward/mean": 0.3359375, + "rewards/tag_count_reward/std": 0.41520074009895325, + "step": 292, + "token_counts/after_target": 1357.0, + "token_counts/after_think": 12.75, + "token_counts/before_target": 2720.5, + "token_counts/before_think": 451.0 + }, + { + "avg_penalty/after_target": 2.372139185667038, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.7636692896485329, + "avg_penalty/before_think": 0.8155692517757416, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 683.5, + "completions/mean_length": 354.75, + "completions/mean_terminated_length": 215.09226608276367, + "completions/min_length": 28.75, + "completions/min_terminated_length": 28.75, + "epoch": 0.1465, + "grad_norm": 27.688213348388672, + "kl": 15.984375, + "learning_rate": 1.987136265072988e-05, + "loss": 1.6329, + "num_tokens": 12133367.0, + "reward": 0.8359375, + "reward_std": 0.87331223487854, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.328125, + "rewards/format_reward/std": 0.48148179799318314, + "rewards/tag_count_reward/mean": 0.5078125, + "rewards/tag_count_reward/std": 0.44140133261680603, + "step": 293, + "token_counts/after_target": 1834.5, + "token_counts/after_think": 120.25, + "token_counts/before_target": 2616.0, + "token_counts/before_think": 1105.25 + }, + { + "avg_penalty/after_target": 2.3733507692813873, + "avg_penalty/after_think": 3.0, + 
"avg_penalty/before_target": 0.8334432989358902, + "avg_penalty/before_think": 0.6549399830400944, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 568.5, + "completions/mean_length": 377.484375, + "completions/mean_terminated_length": 227.71813583374023, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.147, + "grad_norm": 21.39006996154785, + "kl": 21.8125, + "learning_rate": 1.9868557164068073e-05, + "loss": 2.1336, + "num_tokens": 12166934.0, + "reward": 0.8046875, + "reward_std": 0.863653764128685, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.4761601909995079, + "rewards/tag_count_reward/mean": 0.4765625, + "rewards/tag_count_reward/std": 0.43330667167901993, + "step": 294, + "token_counts/after_target": 2312.0, + "token_counts/after_think": 100.0, + "token_counts/before_target": 3001.5, + "token_counts/before_think": 626.25 + }, + { + "avg_penalty/after_target": 2.4802859723567963, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.8341282457113266, + "avg_penalty/before_think": 0.7223187685012817, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 673.75, + "completions/mean_length": 462.796875, + "completions/mean_terminated_length": 324.06900787353516, + "completions/min_length": 53.5, + "completions/min_terminated_length": 53.5, + "epoch": 0.1475, + "grad_norm": 18.321474075317383, + "kl": 19.59375, + "learning_rate": 1.9865721616069695e-05, + "loss": 2.0527, 
+ "num_tokens": 12206729.0, + "reward": 0.9375, + "reward_std": 0.8880471587181091, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.4909028485417366, + "rewards/tag_count_reward/mean": 0.5625, + "rewards/tag_count_reward/std": 0.45223936438560486, + "step": 295, + "token_counts/after_target": 2639.75, + "token_counts/after_think": 228.25, + "token_counts/before_target": 3206.75, + "token_counts/before_think": 1330.0 + }, + { + "avg_penalty/after_target": 2.745223879814148, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.8722589015960693, + "avg_penalty/before_think": 0.5185610800981522, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 573.5, + "completions/mean_length": 410.703125, + "completions/mean_terminated_length": 253.82373046875, + "completions/min_length": 44.5, + "completions/min_terminated_length": 44.5, + "epoch": 0.148, + "grad_norm": 14.780112266540527, + "kl": 26.984375, + "learning_rate": 1.9862856015372315e-05, + "loss": 2.5383, + "num_tokens": 12244630.0, + "reward": 0.8515625, + "reward_std": 0.8397154808044434, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4622559919953346, + "rewards/tag_count_reward/mean": 0.5078125, + "rewards/tag_count_reward/std": 0.41714833676815033, + "step": 296, + "token_counts/after_target": 2963.25, + "token_counts/after_think": 29.75, + "token_counts/before_target": 2748.25, + "token_counts/before_think": 830.0 + }, + { + "avg_penalty/after_target": 2.421932637691498, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.7624981701374054, + "avg_penalty/before_think": 
0.8051125407218933, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 758.0, + "completions/mean_length": 440.75, + "completions/mean_terminated_length": 262.82153701782227, + "completions/min_length": 29.25, + "completions/min_terminated_length": 29.25, + "epoch": 0.1485, + "grad_norm": 62.9839973449707, + "kl": 52.5625, + "learning_rate": 1.985996037070505e-05, + "loss": 3.2192, + "num_tokens": 12282742.0, + "reward": 0.70703125, + "reward_std": 0.8613292574882507, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.4598134011030197, + "rewards/tag_count_reward/mean": 0.42578125, + "rewards/tag_count_reward/std": 0.43535182625055313, + "step": 297, + "token_counts/after_target": 2592.75, + "token_counts/after_think": 16.25, + "token_counts/before_target": 3548.25, + "token_counts/before_think": 894.75 + }, + { + "avg_penalty/after_target": 2.743264317512512, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.8804412633180618, + "avg_penalty/before_think": 0.353105079382658, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 586.25, + "completions/mean_length": 312.890625, + "completions/mean_terminated_length": 164.7821502685547, + "completions/min_length": 11.5, + "completions/min_terminated_length": 11.5, + "epoch": 0.149, + "grad_norm": 120.00735473632812, + "kl": 68.0625, + "learning_rate": 1.985703469088854e-05, + "loss": 3.9888, + "num_tokens": 12314623.0, + "reward": 0.47265625, + "reward_std": 0.6396936923265457, 
+ "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.33539126068353653, + "rewards/tag_count_reward/mean": 0.31640625, + "rewards/tag_count_reward/std": 0.3547080457210541, + "step": 298, + "token_counts/after_target": 2029.5, + "token_counts/after_think": 1.75, + "token_counts/before_target": 2473.75, + "token_counts/before_think": 501.25 + }, + { + "avg_penalty/after_target": 2.2073394060134888, + "avg_penalty/after_think": 2.641873061656952, + "avg_penalty/before_target": 0.6725566014647484, + "avg_penalty/before_think": 0.5679607167840004, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 657.75, + "completions/mean_length": 328.8125, + "completions/mean_terminated_length": 216.8256492614746, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.1495, + "grad_norm": 69.925048828125, + "kl": 56.0, + "learning_rate": 1.9854078984834904e-05, + "loss": 3.2993, + "num_tokens": 12344899.0, + "reward": 0.6796875, + "reward_std": 0.8639078736305237, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.3984375, + "rewards/tag_count_reward/std": 0.4396772161126137, + "step": 299, + "token_counts/after_target": 1614.25, + "token_counts/after_think": 149.75, + "token_counts/before_target": 2925.5, + "token_counts/before_think": 571.5 + }, + { + "avg_penalty/after_target": 2.6940532624721527, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.40989043936133385, + "avg_penalty/before_think": 0.44438889250159264, + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 810.75, + "completions/max_terminated_length": 681.75, + "completions/mean_length": 269.359375, + "completions/mean_terminated_length": 245.11042404174805, + "completions/min_length": 27.5, + "completions/min_terminated_length": 27.5, + "epoch": 0.15, + "grad_norm": 28.80791473388672, + "kl": 33.46875, + "learning_rate": 1.985109326154774e-05, + "loss": 2.0905, + "num_tokens": 12374058.0, + "reward": 0.796875, + "reward_std": 0.8279780596494675, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.45129410922527313, + "rewards/tag_count_reward/mean": 0.484375, + "rewards/tag_count_reward/std": 0.41455313563346863, + "step": 300, + "token_counts/after_target": 836.0, + "token_counts/after_think": 79.5, + "token_counts/before_target": 2702.75, + "token_counts/before_think": 691.5 + }, + { + "avg_penalty/after_target": 2.6425983905792236, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 1.2322617769241333, + "avg_penalty/before_think": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 329.75, + "completions/mean_length": 830.796875, + "completions/mean_terminated_length": 138.13572120666504, + "completions/min_length": 269.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.1505, + "grad_norm": 18.585830688476562, + "kl": 29.6875, + "learning_rate": 1.9848077530122083e-05, + "loss": 2.7793, + "num_tokens": 12439197.0, + "reward": 0.04296875, + "reward_std": 0.12323814257979393, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.04296875, + "rewards/tag_count_reward/std": 0.12323814444243908, + "step": 301, + "token_counts/after_target": 7505.75, + "token_counts/after_think": 0.0, + "token_counts/before_target": 5787.0, + "token_counts/before_think": 0.0 + }, + { + "avg_penalty/after_target": 2.6398611068725586, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 1.3101705312728882, + "avg_penalty/before_think": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 300.75, + "completions/mean_length": 948.4375, + "completions/mean_terminated_length": 248.08333587646484, + "completions/min_length": 463.5, + "completions/min_terminated_length": 207.5, + "epoch": 0.151, + "grad_norm": 31.840965270996094, + "kl": 42.875, + "learning_rate": 1.9845031799744367e-05, + "loss": 3.1622, + "num_tokens": 12508409.0, + "reward": 0.00390625, + "reward_std": 0.015625, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.00390625, + "rewards/tag_count_reward/std": 0.015625, + "step": 302, + "token_counts/after_target": 8741.5, + "token_counts/after_think": 0.0, + "token_counts/before_target": 6433.5, + "token_counts/before_think": 0.0 + }, + { + "avg_penalty/after_target": 2.675745368003845, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 1.3546742796897888, + "avg_penalty/before_think": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + 
"completions/max_length": 1024.0, + "completions/max_terminated_length": 27.75, + "completions/mean_length": 994.140625, + "completions/mean_terminated_length": 17.125, + "completions/min_length": 774.5, + "completions/min_terminated_length": 6.5, + "epoch": 0.1515, + "grad_norm": 23.8353271484375, + "kl": 26.625, + "learning_rate": 1.984195607969242e-05, + "loss": 2.6133, + "num_tokens": 12583154.0, + "reward": 0.03125, + "reward_std": 0.09923820197582245, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.03125, + "rewards/tag_count_reward/std": 0.09923820197582245, + "step": 303, + "token_counts/after_target": 9368.75, + "token_counts/after_think": 0.0, + "token_counts/before_target": 6537.5, + "token_counts/before_think": 0.0 + }, + { + "avg_penalty/after_target": 2.6769068837165833, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 1.355151891708374, + "avg_penalty/before_think": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 5.5, + "completions/mean_length": 992.34375, + "completions/mean_terminated_length": 5.5, + "completions/min_length": 517.5, + "completions/min_terminated_length": 5.5, + "epoch": 0.152, + "grad_norm": 27.69397735595703, + "kl": 24.21875, + "learning_rate": 1.983885037933542e-05, + "loss": 2.4885, + "num_tokens": 12655176.0, + "reward": 0.03515625, + "reward_std": 0.09914018213748932, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.03515625, + "rewards/tag_count_reward/std": 0.09914018586277962, + "step": 304, + 
"token_counts/after_target": 9358.5, + "token_counts/after_think": 0.0, + "token_counts/before_target": 6519.0, + "token_counts/before_think": 0.0 + }, + { + "avg_penalty/after_target": 2.6069682836532593, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 1.3232918083667755, + "avg_penalty/before_think": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.25, + "completions/mean_length": 1008.015625, + "completions/mean_terminated_length": 0.25, + "completions/min_length": 768.25, + "completions/min_terminated_length": 0.25, + "epoch": 0.1525, + "grad_norm": 5.469938278198242, + "kl": 31.71875, + "learning_rate": 1.983571470813386e-05, + "loss": 2.7045, + "num_tokens": 12732041.0, + "reward": 0.0234375, + "reward_std": 0.062167368829250336, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.0234375, + "rewards/tag_count_reward/std": 0.062167370691895485, + "step": 305, + "token_counts/after_target": 9247.75, + "token_counts/after_think": 0.0, + "token_counts/before_target": 6880.5, + "token_counts/before_think": 0.0 + }, + { + "avg_penalty/after_target": 2.6537309288978577, + "avg_penalty/after_think": 0.6095497012138367, + "avg_penalty/before_target": 1.286358505487442, + "avg_penalty/before_think": 1.5762481093406677, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.921875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 960.671875, + "completions/mean_terminated_length": 101.875, + 
"completions/min_length": 523.25, + "completions/min_terminated_length": 11.25, + "epoch": 0.153, + "grad_norm": 23.236915588378906, + "kl": 26.34375, + "learning_rate": 1.983254907563955e-05, + "loss": 2.5637, + "num_tokens": 12804468.0, + "reward": 0.25390625, + "reward_std": 0.39182528108358383, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.3186737820506096, + "rewards/tag_count_reward/mean": 0.14453125, + "rewards/tag_count_reward/std": 0.199452081695199, + "step": 306, + "token_counts/after_target": 8749.5, + "token_counts/after_think": 3.5, + "token_counts/before_target": 6313.25, + "token_counts/before_think": 304.5 + }, + { + "avg_penalty/after_target": 2.625012993812561, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 1.2780249118804932, + "avg_penalty/before_think": 1.3366167396306992, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 502.75, + "completions/mean_length": 934.546875, + "completions/mean_terminated_length": 323.4375, + "completions/min_length": 229.25, + "completions/min_terminated_length": 229.25, + "epoch": 0.1535, + "grad_norm": 21.032495498657227, + "kl": 25.34375, + "learning_rate": 1.9829353491495545e-05, + "loss": 2.4833, + "num_tokens": 12877175.0, + "reward": 0.28125, + "reward_std": 0.341427281498909, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.27156074345111847, + "rewards/tag_count_reward/mean": 0.171875, + "rewards/tag_count_reward/std": 0.20332757383584976, + "step": 307, + "token_counts/after_target": 8179.0, + "token_counts/after_think": 64.75, + "token_counts/before_target": 6062.25, + 
"token_counts/before_think": 646.75 + }, + { + "avg_penalty/after_target": 2.6670612692832947, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 1.2565203607082367, + "avg_penalty/before_think": 1.9804063886404037, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 533.5, + "completions/mean_length": 917.28125, + "completions/mean_terminated_length": 344.1666717529297, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.154, + "grad_norm": 12.6159029006958, + "kl": 29.78125, + "learning_rate": 1.9826127965436153e-05, + "loss": 2.572, + "num_tokens": 12946057.0, + "reward": 0.31640625, + "reward_std": 0.39725008606910706, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.2596946656703949, + "rewards/tag_count_reward/mean": 0.20703125, + "rewards/tag_count_reward/std": 0.24980651959776878, + "step": 308, + "token_counts/after_target": 7942.75, + "token_counts/after_think": 180.75, + "token_counts/before_target": 5963.5, + "token_counts/before_think": 589.5 + }, + { + "avg_penalty/after_target": 2.7091159224510193, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 1.3196857869625092, + "avg_penalty/before_think": 1.236641675233841, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 260.25, + "completions/mean_length": 981.625, + "completions/mean_terminated_length": 229.0, + "completions/min_length": 453.75, + "completions/min_terminated_length": 197.75, + "epoch": 0.1545, 
+ "grad_norm": 38.92384338378906, + "kl": 46.875, + "learning_rate": 1.982287250728689e-05, + "loss": 3.3775, + "num_tokens": 13018577.0, + "reward": 0.2109375, + "reward_std": 0.3585582822561264, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.24866948276758194, + "rewards/tag_count_reward/mean": 0.1171875, + "rewards/tag_count_reward/std": 0.17749575711786747, + "step": 309, + "token_counts/after_target": 8886.25, + "token_counts/after_think": 118.0, + "token_counts/before_target": 6367.75, + "token_counts/before_think": 334.0 + }, + { + "avg_penalty/after_target": 2.5850167274475098, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 1.3044176697731018, + "avg_penalty/before_think": 1.615450233221054, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.953125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 155.5, + "completions/mean_length": 985.8125, + "completions/mean_terminated_length": 153.5, + "completions/min_length": 663.5, + "completions/min_terminated_length": 151.5, + "epoch": 0.155, + "grad_norm": 42.05648422241211, + "kl": 44.6875, + "learning_rate": 1.981958712696444e-05, + "loss": 3.193, + "num_tokens": 13091573.0, + "reward": 0.1484375, + "reward_std": 0.2474822662770748, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.08539126068353653, + "rewards/tag_count_reward/mean": 0.1171875, + "rewards/tag_count_reward/std": 0.18448801152408123, + "step": 310, + "token_counts/after_target": 8881.25, + "token_counts/after_think": 55.5, + "token_counts/before_target": 6504.5, + "token_counts/before_think": 331.75 + }, + { + "avg_penalty/after_target": 2.655960500240326, + 
"avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 1.3105816543102264, + "avg_penalty/before_think": 1.4242450594902039, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 81.75, + "completions/mean_length": 965.1875, + "completions/mean_terminated_length": 80.375, + "completions/min_length": 335.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.1555, + "grad_norm": 11.015217781066895, + "kl": 27.03125, + "learning_rate": 1.9816271834476642e-05, + "loss": 2.4886, + "num_tokens": 13164273.0, + "reward": 0.14453125, + "reward_std": 0.20508235692977905, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.14453125, + "rewards/tag_count_reward/std": 0.20508236438035965, + "step": 311, + "token_counts/after_target": 8786.25, + "token_counts/after_think": 9.0, + "token_counts/before_target": 6087.75, + "token_counts/before_think": 560.0 + }, + { + "avg_penalty/after_target": 2.7255899310112, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 1.3078042268753052, + "avg_penalty/before_think": 1.2055795341730118, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 432.75, + "completions/mean_length": 907.703125, + "completions/mean_terminated_length": 307.17857360839844, + "completions/min_length": 480.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.156, + "grad_norm": 36.580909729003906, + "kl": 18.9375, + "learning_rate": 1.981292663992245e-05, + "loss": 2.2687, + 
"num_tokens": 13232494.0, + "reward": 0.29296875, + "reward_std": 0.39635635167360306, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.24866948276758194, + "rewards/tag_count_reward/mean": 0.18359375, + "rewards/tag_count_reward/std": 0.2298159934580326, + "step": 312, + "token_counts/after_target": 8166.25, + "token_counts/after_think": 135.25, + "token_counts/before_target": 5331.5, + "token_counts/before_think": 890.25 + }, + { + "avg_penalty/after_target": 2.6656474471092224, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 1.2855326533317566, + "avg_penalty/before_think": 1.531068816781044, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 953.515625, + "completions/mean_terminated_length": 348.375, + "completions/min_length": 244.75, + "completions/min_terminated_length": 244.75, + "epoch": 0.1565, + "grad_norm": 18.074430465698242, + "kl": 25.0, + "learning_rate": 1.9809551553491918e-05, + "loss": 2.4933, + "num_tokens": 13303279.0, + "reward": 0.23046875, + "reward_std": 0.3075352869927883, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.14789126068353653, + "rewards/tag_count_reward/mean": 0.18359375, + "rewards/tag_count_reward/std": 0.1952817179262638, + "step": 313, + "token_counts/after_target": 8285.5, + "token_counts/after_think": 206.5, + "token_counts/before_target": 5398.75, + "token_counts/before_think": 1365.5 + }, + { + "avg_penalty/after_target": 2.7026100754737854, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 1.2488023936748505, + 
"avg_penalty/before_think": 0.9544152840971947, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 547.75, + "completions/mean_length": 967.578125, + "completions/mean_terminated_length": 467.0416717529297, + "completions/min_length": 376.25, + "completions/min_terminated_length": 376.25, + "epoch": 0.157, + "grad_norm": 44.01554870605469, + "kl": 48.8125, + "learning_rate": 1.980614658546613e-05, + "loss": 3.4076, + "num_tokens": 13377540.0, + "reward": 0.10546875, + "reward_std": 0.16517486423254013, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.10546875, + "rewards/tag_count_reward/std": 0.16517487168312073, + "step": 314, + "token_counts/after_target": 8765.25, + "token_counts/after_think": 0.0, + "token_counts/before_target": 6237.75, + "token_counts/before_think": 478.25 + }, + { + "avg_penalty/after_target": 2.4957857728004456, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 1.220296025276184, + "avg_penalty/before_think": 0.8391263484954834, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 534.5, + "completions/mean_length": 907.1875, + "completions/mean_terminated_length": 312.9166669845581, + "completions/min_length": 185.75, + "completions/min_terminated_length": 185.75, + "epoch": 0.1575, + "grad_norm": 55.84909439086914, + "kl": 50.75, + "learning_rate": 1.9802711746217222e-05, + "loss": 3.3118, + "num_tokens": 13446720.0, + "reward": 0.15625, + "reward_std": 
0.2481921948492527, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.140625, + "rewards/tag_count_reward/std": 0.20665005967020988, + "step": 315, + "token_counts/after_target": 7807.25, + "token_counts/after_think": 30.5, + "token_counts/before_target": 5945.75, + "token_counts/before_think": 731.5 + }, + { + "avg_penalty/after_target": 2.573690176010132, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 1.2084460854530334, + "avg_penalty/before_think": 2.2813847064971924, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 260.75, + "completions/mean_length": 891.203125, + "completions/mean_terminated_length": 204.8125, + "completions/min_length": 173.5, + "completions/min_terminated_length": 173.5, + "epoch": 0.158, + "grad_norm": 30.553543090820312, + "kl": 43.5625, + "learning_rate": 1.9799247046208297e-05, + "loss": 3.1303, + "num_tokens": 13513869.0, + "reward": 0.12890625, + "reward_std": 0.18996241316199303, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.12890625, + "rewards/tag_count_reward/std": 0.18996241688728333, + "step": 316, + "token_counts/after_target": 7795.0, + "token_counts/after_think": 186.5, + "token_counts/before_target": 5855.75, + "token_counts/before_think": 422.0 + }, + { + "avg_penalty/after_target": 2.377823770046234, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 1.2046368718147278, + "avg_penalty/before_think": 0.6414149850606918, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 600.5, + "completions/mean_length": 860.640625, + "completions/mean_terminated_length": 348.54167556762695, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.1585, + "grad_norm": 4.363918304443359, + "kl": 29.84375, + "learning_rate": 1.979575249599344e-05, + "loss": 2.4902, + "num_tokens": 13581398.0, + "reward": 0.140625, + "reward_std": 0.24309122934937477, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.125, + "rewards/tag_count_reward/std": 0.19152813032269478, + "step": 317, + "token_counts/after_target": 7325.0, + "token_counts/after_think": 2.25, + "token_counts/before_target": 6193.25, + "token_counts/before_think": 249.75 + }, + { + "avg_penalty/after_target": 2.546196222305298, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.7693897187709808, + "avg_penalty/before_think": 1.0290547162294388, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 808.5, + "completions/mean_length": 562.8125, + "completions/mean_terminated_length": 367.66839599609375, + "completions/min_length": 35.5, + "completions/min_terminated_length": 35.5, + "epoch": 0.159, + "grad_norm": 24.626846313476562, + "kl": 13.609375, + "learning_rate": 1.979222810621766e-05, + "loss": 1.7502, + "num_tokens": 13625178.0, + "reward": 0.31640625, + "reward_std": 0.27623454481363297, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.30078125, + "rewards/tag_count_reward/std": 0.23986542969942093, + "step": 318, + "token_counts/after_target": 4008.0, + "token_counts/after_think": 59.5, + "token_counts/before_target": 4336.25, + "token_counts/before_think": 601.25 + }, + { + "avg_penalty/after_target": 2.502116560935974, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.8201390206813812, + "avg_penalty/before_think": 0.7992706224322319, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 837.75, + "completions/mean_length": 471.671875, + "completions/mean_terminated_length": 368.0549545288086, + "completions/min_length": 81.75, + "completions/min_terminated_length": 81.75, + "epoch": 0.1595, + "grad_norm": 26.359481811523438, + "kl": 8.0859375, + "learning_rate": 1.9788673887616852e-05, + "loss": 1.4501, + "num_tokens": 13663157.0, + "reward": 0.4921875, + "reward_std": 0.5240374878048897, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.140625, + "rewards/format_reward/std": 0.34944770485162735, + "rewards/tag_count_reward/mean": 0.3515625, + "rewards/tag_count_reward/std": 0.2822730205953121, + "step": 319, + "token_counts/after_target": 3156.5, + "token_counts/after_think": 39.25, + "token_counts/before_target": 3735.0, + "token_counts/before_think": 616.0 + }, + { + "avg_penalty/after_target": 3.266851305961609, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.5222049877047539, + "avg_penalty/before_think": 0.540187481790781, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 740.25, + "completions/max_terminated_length": 734.75, + "completions/mean_length": 277.921875, + "completions/mean_terminated_length": 242.7439956665039, + "completions/min_length": 29.5, + "completions/min_terminated_length": 29.5, + "epoch": 0.16, + "grad_norm": 23.735389709472656, + "kl": 10.234375, + "learning_rate": 1.9785089851017788e-05, + "loss": 1.5056, + "num_tokens": 13694048.0, + "reward": 0.54296875, + "reward_std": 0.5392824932932854, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.3186737820506096, + "rewards/tag_count_reward/mean": 0.43359375, + "rewards/tag_count_reward/std": 0.33883266896009445, + "step": 320, + "token_counts/after_target": 1681.25, + "token_counts/after_think": 38.75, + "token_counts/before_target": 2221.25, + "token_counts/before_think": 505.5 + }, + { + "avg_penalty/after_target": 3.0427240133285522, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.3344067297875881, + "avg_penalty/before_think": 0.311693973839283, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.5, + "completions/max_terminated_length": 582.5, + "completions/mean_length": 167.703125, + "completions/mean_terminated_length": 167.703125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.1605, + "grad_norm": 13.477130889892578, + "kl": 18.8125, + "learning_rate": 1.9781476007338058e-05, + "loss": 1.324, + "num_tokens": 13714349.0, + "reward": 0.4609375, + "reward_std": 0.5396609604358673, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.24866948276758194, + 
"rewards/tag_count_reward/mean": 0.3671875, + "rewards/tag_count_reward/std": 0.33512863516807556, + "step": 321, + "token_counts/after_target": 537.5, + "token_counts/after_think": 11.0, + "token_counts/before_target": 1835.0, + "token_counts/before_think": 299.75 + }, + { + "avg_penalty/after_target": 2.8263776302337646, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.6061085984110832, + "avg_penalty/before_think": 0.6063809618353844, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 814.0, + "completions/max_terminated_length": 735.25, + "completions/mean_length": 234.171875, + "completions/mean_terminated_length": 221.75104522705078, + "completions/min_length": 32.75, + "completions/min_terminated_length": 32.75, + "epoch": 0.161, + "grad_norm": 14.333884239196777, + "kl": 17.4375, + "learning_rate": 1.977783236758606e-05, + "loss": 1.748, + "num_tokens": 13739992.0, + "reward": 0.75, + "reward_std": 0.7302019745111465, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.44091323018074036, + "rewards/tag_count_reward/mean": 0.5, + "rewards/tag_count_reward/std": 0.37260495126247406, + "step": 322, + "token_counts/after_target": 1210.75, + "token_counts/after_think": 106.5, + "token_counts/before_target": 1829.0, + "token_counts/before_think": 600.5 + }, + { + "avg_penalty/after_target": 3.5169705152511597, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3428714647889137, + "avg_penalty/before_think": 0.31862450391054153, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.75, + 
"completions/max_terminated_length": 659.75, + "completions/mean_length": 236.296875, + "completions/mean_terminated_length": 236.296875, + "completions/min_length": 55.75, + "completions/min_terminated_length": 55.75, + "epoch": 0.1615, + "grad_norm": 5.222476005554199, + "kl": 20.3125, + "learning_rate": 1.9774158942860962e-05, + "loss": 1.6372, + "num_tokens": 13764859.0, + "reward": 0.90234375, + "reward_std": 0.8176808953285217, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.328125, + "rewards/format_reward/std": 0.47354350984096527, + "rewards/tag_count_reward/mean": 0.57421875, + "rewards/tag_count_reward/std": 0.437775120139122, + "step": 323, + "token_counts/after_target": 816.0, + "token_counts/after_think": 35.0, + "token_counts/before_target": 2246.0, + "token_counts/before_think": 683.75 + }, + { + "avg_penalty/after_target": 2.2062294483184814, + "avg_penalty/after_think": 2.669892966747284, + "avg_penalty/before_target": 0.5273224152624607, + "avg_penalty/before_think": 0.4943709261715412, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 765.5, + "completions/max_terminated_length": 695.5, + "completions/mean_length": 246.671875, + "completions/mean_terminated_length": 222.9933090209961, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.162, + "grad_norm": 21.46671485900879, + "kl": 32.59375, + "learning_rate": 1.977045574435264e-05, + "loss": 2.1746, + "num_tokens": 13793910.0, + "reward": 0.73046875, + "reward_std": 0.8752614110708237, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.296875, + "rewards/format_reward/std": 0.4682852029800415, + "rewards/tag_count_reward/mean": 0.43359375, + 
"rewards/tag_count_reward/std": 0.45916733145713806, + "step": 324, + "token_counts/after_target": 1115.25, + "token_counts/after_think": 11.5, + "token_counts/before_target": 2230.0, + "token_counts/before_think": 590.0 + }, + { + "avg_penalty/after_target": 2.320166051387787, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.4431834891438484, + "avg_penalty/before_think": 0.2759392000734806, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 700.25, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 247.171875, + "completions/mean_terminated_length": 222.36607360839844, + "completions/min_length": 42.75, + "completions/min_terminated_length": 42.75, + "epoch": 0.1625, + "grad_norm": 20.957643508911133, + "kl": 31.0625, + "learning_rate": 1.9766722783341682e-05, + "loss": 2.0389, + "num_tokens": 13819601.0, + "reward": 0.7890625, + "reward_std": 0.9127470403909683, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4745560586452484, + "rewards/tag_count_reward/mean": 0.4453125, + "rewards/tag_count_reward/std": 0.47082383185625076, + "step": 325, + "token_counts/after_target": 863.5, + "token_counts/after_think": 148.5, + "token_counts/before_target": 2458.0, + "token_counts/before_think": 484.75 + }, + { + "avg_penalty/after_target": 2.6125500202178955, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.7227107137441635, + "avg_penalty/before_think": 0.5054807141423225, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 1006.75, + "completions/max_terminated_length": 850.0, 
+ "completions/mean_length": 431.546875, + "completions/mean_terminated_length": 367.48810958862305, + "completions/min_length": 66.5, + "completions/min_terminated_length": 66.5, + "epoch": 0.163, + "grad_norm": 12.098335266113281, + "kl": 31.03125, + "learning_rate": 1.9762960071199334e-05, + "loss": 2.4542, + "num_tokens": 13857140.0, + "reward": 0.41015625, + "reward_std": 0.7528323605656624, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.1875, + "rewards/format_reward/std": 0.3837348371744156, + "rewards/tag_count_reward/mean": 0.22265625, + "rewards/tag_count_reward/std": 0.3819378763437271, + "step": 326, + "token_counts/after_target": 2881.5, + "token_counts/after_think": 10.5, + "token_counts/before_target": 3789.0, + "token_counts/before_think": 223.75 + }, + { + "avg_penalty/after_target": 2.566158652305603, + "avg_penalty/after_think": 2.7709537744522095, + "avg_penalty/before_target": 0.5486475564539433, + "avg_penalty/before_think": 0.44507331401109695, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 868.0, + "completions/max_terminated_length": 736.5, + "completions/mean_length": 314.40625, + "completions/mean_terminated_length": 280.6974792480469, + "completions/min_length": 68.75, + "completions/min_terminated_length": 68.75, + "epoch": 0.1635, + "grad_norm": 13.582267761230469, + "kl": 18.484375, + "learning_rate": 1.9759167619387474e-05, + "loss": 1.9173, + "num_tokens": 13888286.0, + "reward": 0.7890625, + "reward_std": 0.8844085484743118, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.359375, + "rewards/format_reward/std": 0.46326854079961777, + "rewards/tag_count_reward/mean": 0.4296875, + "rewards/tag_count_reward/std": 0.45001547038555145, + 
"step": 327, + "token_counts/after_target": 1348.25, + "token_counts/after_think": 108.5, + "token_counts/before_target": 3153.75, + "token_counts/before_think": 420.0 + }, + { + "avg_penalty/after_target": 2.03668574988842, + "avg_penalty/after_think": 1.551540195941925, + "avg_penalty/before_target": 0.6289135664701462, + "avg_penalty/before_think": 0.4451485201716423, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 950.0, + "completions/max_terminated_length": 831.75, + "completions/mean_length": 358.640625, + "completions/mean_terminated_length": 313.5507278442383, + "completions/min_length": 45.75, + "completions/min_terminated_length": 45.75, + "epoch": 0.164, + "grad_norm": 7.940175533294678, + "kl": 26.34375, + "learning_rate": 1.9755345439458566e-05, + "loss": 2.1744, + "num_tokens": 13923575.0, + "reward": 0.52734375, + "reward_std": 0.840016707777977, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.203125, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.30859375, + "rewards/tag_count_reward/std": 0.4211016595363617, + "step": 328, + "token_counts/after_target": 1892.75, + "token_counts/after_think": 13.25, + "token_counts/before_target": 3402.0, + "token_counts/before_think": 430.25 + }, + { + "avg_penalty/after_target": 2.3240617215633392, + "avg_penalty/after_think": 1.4995016753673553, + "avg_penalty/before_target": 0.5477387458086014, + "avg_penalty/before_think": 0.6625381968915462, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 854.25, + "completions/max_terminated_length": 652.5, + 
"completions/mean_length": 301.140625, + "completions/mean_terminated_length": 263.2631034851074, + "completions/min_length": 47.75, + "completions/min_terminated_length": 47.75, + "epoch": 0.1645, + "grad_norm": 7.862532138824463, + "kl": 19.6875, + "learning_rate": 1.9751493543055634e-05, + "loss": 1.726, + "num_tokens": 13954256.0, + "reward": 0.921875, + "reward_std": 0.9960539788007736, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.5071863383054733, + "rewards/tag_count_reward/mean": 0.46875, + "rewards/tag_count_reward/std": 0.50046007335186, + "step": 329, + "token_counts/after_target": 1464.25, + "token_counts/after_think": 40.5, + "token_counts/before_target": 2367.5, + "token_counts/before_think": 946.0 + }, + { + "avg_penalty/after_target": 2.4595097303390503, + "avg_penalty/after_think": 0.9352795779705048, + "avg_penalty/before_target": 0.4719106871634722, + "avg_penalty/before_think": 0.6617096811532974, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 853.0, + "completions/max_terminated_length": 761.75, + "completions/mean_length": 361.1875, + "completions/mean_terminated_length": 296.07814025878906, + "completions/min_length": 60.75, + "completions/min_terminated_length": 60.75, + "epoch": 0.165, + "grad_norm": 21.273073196411133, + "kl": 33.46875, + "learning_rate": 1.974761194191222e-05, + "loss": 2.1641, + "num_tokens": 13987772.0, + "reward": 0.6484375, + "reward_std": 0.9121766984462738, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.4713720977306366, + "rewards/tag_count_reward/mean": 0.3359375, + "rewards/tag_count_reward/std": 0.4530441388487816, + "step": 330, + 
"token_counts/after_target": 1711.0, + "token_counts/after_think": 20.0, + "token_counts/before_target": 3236.75, + "token_counts/before_think": 811.25 + }, + { + "avg_penalty/after_target": 2.458571642637253, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3927019536495209, + "avg_penalty/before_think": 0.491722010076046, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 857.5, + "completions/max_terminated_length": 689.0, + "completions/mean_length": 336.625, + "completions/mean_terminated_length": 314.3645896911621, + "completions/min_length": 67.5, + "completions/min_terminated_length": 67.5, + "epoch": 0.1655, + "grad_norm": 6.557216167449951, + "kl": 20.390625, + "learning_rate": 1.9743700647852356e-05, + "loss": 1.6951, + "num_tokens": 14022868.0, + "reward": 0.82421875, + "reward_std": 0.9019670188426971, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.359375, + "rewards/format_reward/std": 0.479247085750103, + "rewards/tag_count_reward/mean": 0.46484375, + "rewards/tag_count_reward/std": 0.46115053445100784, + "step": 331, + "token_counts/after_target": 1407.5, + "token_counts/after_think": 74.0, + "token_counts/before_target": 2981.25, + "token_counts/before_think": 923.25 + }, + { + "avg_penalty/after_target": 2.1707046926021576, + "avg_penalty/after_think": 1.995798647403717, + "avg_penalty/before_target": 0.5265535116195679, + "avg_penalty/before_think": 0.36004872620105743, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 368.71875, + 
"completions/mean_terminated_length": 301.00001525878906, + "completions/min_length": 105.75, + "completions/min_terminated_length": 105.75, + "epoch": 0.166, + "grad_norm": 3.0697340965270996, + "kl": 25.453125, + "learning_rate": 1.973975967279052e-05, + "loss": 1.9881, + "num_tokens": 14058258.0, + "reward": 0.8515625, + "reward_std": 0.9407677948474884, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.40625, + "rewards/format_reward/std": 0.49297719448804855, + "rewards/tag_count_reward/mean": 0.4453125, + "rewards/tag_count_reward/std": 0.4717831537127495, + "step": 332, + "token_counts/after_target": 1411.25, + "token_counts/after_think": 153.75, + "token_counts/before_target": 3313.0, + "token_counts/before_think": 1021.5 + }, + { + "avg_penalty/after_target": 2.4842593669891357, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3540908135473728, + "avg_penalty/before_think": 0.6408871859312057, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 901.75, + "completions/max_terminated_length": 718.5, + "completions/mean_length": 342.3125, + "completions/mean_terminated_length": 321.50938415527344, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.1665, + "grad_norm": 4.345587253570557, + "kl": 14.7265625, + "learning_rate": 1.9735789028731603e-05, + "loss": 1.283, + "num_tokens": 14089430.0, + "reward": 1.07421875, + "reward_std": 0.9622233062982559, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5143726766109467, + "rewards/tag_count_reward/mean": 0.57421875, + "rewards/tag_count_reward/std": 0.47801367193460464, + "step": 333, + "token_counts/after_target": 1195.5, + 
"token_counts/after_think": 82.5, + "token_counts/before_target": 2629.75, + "token_counts/before_think": 1569.25 + }, + { + "avg_penalty/after_target": 2.6612069606781006, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4488101676106453, + "avg_penalty/before_think": 0.7860033512115479, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 927.5, + "completions/max_terminated_length": 921.75, + "completions/mean_length": 410.5625, + "completions/mean_terminated_length": 382.5528869628906, + "completions/min_length": 123.5, + "completions/min_terminated_length": 123.5, + "epoch": 0.167, + "grad_norm": 7.720501899719238, + "kl": 9.1953125, + "learning_rate": 1.9731788727770885e-05, + "loss": 1.3314, + "num_tokens": 14126090.0, + "reward": 1.23046875, + "reward_std": 0.8140065371990204, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.515625, + "rewards/format_reward/std": 0.49654312431812286, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.38827090710401535, + "step": 334, + "token_counts/after_target": 1607.0, + "token_counts/after_think": 362.75, + "token_counts/before_target": 2278.25, + "token_counts/before_think": 2321.0 + }, + { + "avg_penalty/after_target": 2.2260827720165253, + "avg_penalty/after_think": 3.6209608912467957, + "avg_penalty/before_target": 0.3114341199398041, + "avg_penalty/before_think": 0.619412824511528, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 306.296875, + "completions/mean_terminated_length": 306.296875, + 
"completions/min_length": 94.25, + "completions/min_terminated_length": 94.25, + "epoch": 0.1675, + "grad_norm": 7.455537796020508, + "kl": 9.5859375, + "learning_rate": 1.972775878209397e-05, + "loss": 0.9962, + "num_tokens": 14154669.0, + "reward": 1.44140625, + "reward_std": 0.8171793222427368, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.479247085750103, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.3653804026544094, + "step": 335, + "token_counts/after_target": 829.0, + "token_counts/after_think": 187.5, + "token_counts/before_target": 1878.0, + "token_counts/before_think": 2006.25 + }, + { + "avg_penalty/after_target": 2.7470075488090515, + "avg_penalty/after_think": 1.7375709414482117, + "avg_penalty/before_target": 0.42711444944143295, + "avg_penalty/before_think": 0.5020183399319649, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 831.5, + "completions/max_terminated_length": 683.25, + "completions/mean_length": 251.421875, + "completions/mean_terminated_length": 238.19375228881836, + "completions/min_length": 79.75, + "completions/min_terminated_length": 79.75, + "epoch": 0.168, + "grad_norm": 7.8843584060668945, + "kl": 24.140625, + "learning_rate": 1.9723699203976768e-05, + "loss": 1.9162, + "num_tokens": 14180216.0, + "reward": 1.0390625, + "reward_std": 0.8326003849506378, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.40625, + "rewards/format_reward/std": 0.4955305755138397, + "rewards/tag_count_reward/mean": 0.6328125, + "rewards/tag_count_reward/std": 0.4073299542069435, + "step": 336, + "token_counts/after_target": 948.5, + "token_counts/after_think": 17.25, + 
"token_counts/before_target": 2178.0, + "token_counts/before_think": 879.0 + }, + { + "avg_penalty/after_target": 3.0513362288475037, + "avg_penalty/after_think": 2.93764990568161, + "avg_penalty/before_target": 0.24337682127952576, + "avg_penalty/before_think": 0.5705467090010643, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 220.09375, + "completions/mean_terminated_length": 220.09375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.1685, + "grad_norm": 15.128037452697754, + "kl": 18.578125, + "learning_rate": 1.9719610005785466e-05, + "loss": 1.2548, + "num_tokens": 14203630.0, + "reward": 1.08984375, + "reward_std": 0.8440384119749069, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.4375, + "rewards/format_reward/std": 0.49776528775691986, + "rewards/tag_count_reward/mean": 0.65234375, + "rewards/tag_count_reward/std": 0.39887867122888565, + "step": 337, + "token_counts/after_target": 305.0, + "token_counts/after_think": 89.25, + "token_counts/before_target": 1960.5, + "token_counts/before_think": 1166.75 + }, + { + "avg_penalty/after_target": 3.417846918106079, + "avg_penalty/after_think": 2.7958643436431885, + "avg_penalty/before_target": 0.23085231333971024, + "avg_penalty/before_think": 0.46424172818660736, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.5, + "completions/max_terminated_length": 400.5, + "completions/mean_length": 193.203125, + "completions/mean_terminated_length": 193.203125, + "completions/min_length": 49.75, + 
"completions/min_terminated_length": 49.75, + "epoch": 0.169, + "grad_norm": 5.0102620124816895, + "kl": 15.65625, + "learning_rate": 1.9715491199976462e-05, + "loss": 1.2351, + "num_tokens": 14224027.0, + "reward": 1.06640625, + "reward_std": 0.8308160752058029, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.40625, + "rewards/format_reward/std": 0.4970766380429268, + "rewards/tag_count_reward/mean": 0.66015625, + "rewards/tag_count_reward/std": 0.4056064710021019, + "step": 338, + "token_counts/after_target": 435.75, + "token_counts/after_think": 31.25, + "token_counts/before_target": 1510.0, + "token_counts/before_think": 1114.25 + }, + { + "avg_penalty/after_target": 3.0996240973472595, + "avg_penalty/after_think": 3.413713425397873, + "avg_penalty/before_target": 0.23344016075134277, + "avg_penalty/before_think": 0.44283629953861237, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.5, + "completions/max_terminated_length": 481.5, + "completions/mean_length": 228.109375, + "completions/mean_terminated_length": 228.109375, + "completions/min_length": 76.25, + "completions/min_terminated_length": 76.25, + "epoch": 0.1695, + "grad_norm": 3.6025798320770264, + "kl": 13.4375, + "learning_rate": 1.971134279909636e-05, + "loss": 1.1293, + "num_tokens": 14248338.0, + "reward": 1.25, + "reward_std": 0.829290047287941, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5102732330560684, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.3546702712774277, + "step": 339, + "token_counts/after_target": 324.75, + "token_counts/after_think": 124.25, + "token_counts/before_target": 1909.0, + 
"token_counts/before_think": 1291.75 + }, + { + "avg_penalty/after_target": 2.59494948387146, + "avg_penalty/after_think": 2.9817970991134644, + "avg_penalty/before_target": 0.2507741190493107, + "avg_penalty/before_think": 0.6804739832878113, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 234.5, + "completions/mean_terminated_length": 234.5, + "completions/min_length": 64.75, + "completions/min_terminated_length": 64.75, + "epoch": 0.17, + "grad_norm": 4.611401557922363, + "kl": 12.99609375, + "learning_rate": 1.970716481578191e-05, + "loss": 1.2052, + "num_tokens": 14276018.0, + "reward": 1.390625, + "reward_std": 0.7282116785645485, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.466681070625782, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.32405900210142136, + "step": 340, + "token_counts/after_target": 487.25, + "token_counts/after_think": 83.0, + "token_counts/before_target": 1808.0, + "token_counts/before_think": 1373.75 + }, + { + "avg_penalty/after_target": 2.7762034833431244, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.36235102266073227, + "avg_penalty/before_think": 0.27034949138760567, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 689.0, + "completions/max_terminated_length": 689.0, + "completions/mean_length": 262.734375, + "completions/mean_terminated_length": 262.734375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.1705, 
+ "grad_norm": 8.346710205078125, + "kl": 22.6875, + "learning_rate": 1.9702957262759964e-05, + "loss": 1.8652, + "num_tokens": 14301745.0, + "reward": 0.69140625, + "reward_std": 0.641988568007946, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.171875, + "rewards/format_reward/std": 0.3683478757739067, + "rewards/tag_count_reward/mean": 0.51953125, + "rewards/tag_count_reward/std": 0.35222508758306503, + "step": 341, + "token_counts/after_target": 1081.5, + "token_counts/after_think": 25.25, + "token_counts/before_target": 2482.0, + "token_counts/before_think": 615.0 + }, + { + "avg_penalty/after_target": 2.6275689601898193, + "avg_penalty/after_think": 2.9650647044181824, + "avg_penalty/before_target": 0.4048202410340309, + "avg_penalty/before_think": 0.4943614974617958, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 648.0, + "completions/max_terminated_length": 558.75, + "completions/mean_length": 245.0625, + "completions/mean_terminated_length": 233.1750030517578, + "completions/min_length": 52.25, + "completions/min_terminated_length": 52.25, + "epoch": 0.171, + "grad_norm": 6.743316650390625, + "kl": 17.0, + "learning_rate": 1.969872015284747e-05, + "loss": 1.5821, + "num_tokens": 14326421.0, + "reward": 0.81640625, + "reward_std": 0.7007454037666321, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.56640625, + "rewards/tag_count_reward/std": 0.34182237461209297, + "step": 342, + "token_counts/after_target": 944.5, + "token_counts/after_think": 36.25, + "token_counts/before_target": 2134.25, + "token_counts/before_think": 806.0 + }, + { + "avg_penalty/after_target": 
3.335149049758911, + "avg_penalty/after_think": 2.718540370464325, + "avg_penalty/before_target": 0.28506115078926086, + "avg_penalty/before_think": 0.4311120957136154, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.25, + "completions/max_terminated_length": 490.25, + "completions/mean_length": 212.203125, + "completions/mean_terminated_length": 212.203125, + "completions/min_length": 47.75, + "completions/min_terminated_length": 47.75, + "epoch": 0.1715, + "grad_norm": 7.291497707366943, + "kl": 13.6875, + "learning_rate": 1.9694453498951392e-05, + "loss": 1.2977, + "num_tokens": 14353234.0, + "reward": 1.03515625, + "reward_std": 0.8308381289243698, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.421875, + "rewards/format_reward/std": 0.48989029973745346, + "rewards/tag_count_reward/mean": 0.61328125, + "rewards/tag_count_reward/std": 0.3952305540442467, + "step": 343, + "token_counts/after_target": 587.0, + "token_counts/after_think": 32.5, + "token_counts/before_target": 1869.5, + "token_counts/before_think": 906.25 + }, + { + "avg_penalty/after_target": 2.636787235736847, + "avg_penalty/after_think": 2.834075093269348, + "avg_penalty/before_target": 0.34300781786441803, + "avg_penalty/before_think": 0.495687797665596, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.5, + "completions/max_terminated_length": 425.5, + "completions/mean_length": 189.390625, + "completions/mean_terminated_length": 189.390625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.172, + "grad_norm": 14.554754257202148, + "kl": 9.0703125, + 
"learning_rate": 1.9690157314068696e-05, + "loss": 1.1124, + "num_tokens": 14379195.0, + "reward": 1.1484375, + "reward_std": 0.8423186987638474, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.48456869274377823, + "rewards/tag_count_reward/mean": 0.6484375, + "rewards/tag_count_reward/std": 0.44066157191991806, + "step": 344, + "token_counts/after_target": 411.0, + "token_counts/after_think": 101.5, + "token_counts/before_target": 1780.0, + "token_counts/before_think": 737.75 + }, + { + "avg_penalty/after_target": 3.4943939447402954, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.2769079841673374, + "avg_penalty/before_think": 0.25222211331129074, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.75, + "completions/max_terminated_length": 342.75, + "completions/mean_length": 156.75, + "completions/mean_terminated_length": 156.75, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.1725, + "grad_norm": 6.437824726104736, + "kl": 15.59375, + "learning_rate": 1.9685831611286312e-05, + "loss": 1.3793, + "num_tokens": 14398811.0, + "reward": 1.0234375, + "reward_std": 0.922385647892952, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.49244368076324463, + "rewards/tag_count_reward/mean": 0.5703125, + "rewards/tag_count_reward/std": 0.4729341045022011, + "step": 345, + "token_counts/after_target": 319.5, + "token_counts/after_think": 70.0, + "token_counts/before_target": 1485.75, + "token_counts/before_think": 632.75 + }, + { + "avg_penalty/after_target": 2.7471261024475098, + "avg_penalty/after_think": 3.780770242214203, + 
"avg_penalty/before_target": 0.25381653010845184, + "avg_penalty/before_think": 0.3830210417509079, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 138.1875, + "completions/mean_terminated_length": 138.1875, + "completions/min_length": 50.75, + "completions/min_terminated_length": 50.75, + "epoch": 0.173, + "grad_norm": 5.04265022277832, + "kl": 14.71875, + "learning_rate": 1.968147640378108e-05, + "loss": 1.1949, + "num_tokens": 14417783.0, + "reward": 1.60546875, + "reward_std": 0.7523674815893173, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.40263500809669495, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.37513044476509094, + "step": 346, + "token_counts/after_target": 221.25, + "token_counts/after_think": 41.75, + "token_counts/before_target": 1123.5, + "token_counts/before_think": 824.5 + }, + { + "avg_penalty/after_target": 3.2772931456565857, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.18734567984938622, + "avg_penalty/before_think": 0.2956553027033806, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.25, + "completions/max_terminated_length": 267.25, + "completions/mean_length": 146.4375, + "completions/mean_terminated_length": 146.4375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.1735, + "grad_norm": 17.954954147338867, + "kl": 20.515625, + "learning_rate": 1.9677091704819714e-05, + "loss": 1.3134, + "num_tokens": 14436051.0, + 
"reward": 1.30078125, + "reward_std": 0.8815110772848129, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.48296456038951874, + "rewards/tag_count_reward/mean": 0.67578125, + "rewards/tag_count_reward/std": 0.42935849726200104, + "step": 347, + "token_counts/after_target": 162.75, + "token_counts/after_think": 57.0, + "token_counts/before_target": 1407.75, + "token_counts/before_think": 715.5 + }, + { + "avg_penalty/after_target": 3.1682685017585754, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.21381483227014542, + "avg_penalty/before_think": 0.32117655873298645, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.25, + "completions/max_terminated_length": 353.25, + "completions/mean_length": 148.421875, + "completions/mean_terminated_length": 148.421875, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.174, + "grad_norm": 29.783184051513672, + "kl": 31.53125, + "learning_rate": 1.967267752775877e-05, + "loss": 1.7898, + "num_tokens": 14456014.0, + "reward": 1.234375, + "reward_std": 0.9457852840423584, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.50393907725811, + "rewards/tag_count_reward/mean": 0.640625, + "rewards/tag_count_reward/std": 0.4641121029853821, + "step": 348, + "token_counts/after_target": 236.0, + "token_counts/after_think": 19.0, + "token_counts/before_target": 1476.25, + "token_counts/before_think": 643.5 + }, + { + "avg_penalty/after_target": 2.993548035621643, + "avg_penalty/after_think": 3.981346905231476, + "avg_penalty/before_target": 0.3760824091732502, + "avg_penalty/before_think": 0.6321750432252884, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 547.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 189.46875, + "completions/mean_terminated_length": 176.45312881469727, + "completions/min_length": 54.75, + "completions/min_terminated_length": 54.75, + "epoch": 0.1745, + "grad_norm": 22.454591751098633, + "kl": 23.828125, + "learning_rate": 1.9668233886044597e-05, + "loss": 1.7931, + "num_tokens": 14478444.0, + "reward": 1.25390625, + "reward_std": 0.9490405470132828, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.5061737895011902, + "rewards/tag_count_reward/mean": 0.66015625, + "rewards/tag_count_reward/std": 0.4698772430419922, + "step": 349, + "token_counts/after_target": 533.25, + "token_counts/after_think": 77.5, + "token_counts/before_target": 1719.75, + "token_counts/before_think": 701.0 + }, + { + "avg_penalty/after_target": 3.086000680923462, + "avg_penalty/after_think": 2.5812626481056213, + "avg_penalty/before_target": 0.2935364730656147, + "avg_penalty/before_think": 0.3421177715063095, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.5, + "completions/max_terminated_length": 567.5, + "completions/mean_length": 184.578125, + "completions/mean_terminated_length": 184.578125, + "completions/min_length": 58.25, + "completions/min_terminated_length": 58.25, + "epoch": 0.175, + "grad_norm": 13.875299453735352, + "kl": 31.3125, + "learning_rate": 1.9663760793213297e-05, + "loss": 2.1184, + "num_tokens": 14500689.0, + "reward": 1.21875, + "reward_std": 0.9654181003570557, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.5018647313117981, + "rewards/tag_count_reward/mean": 0.625, + "rewards/tag_count_reward/std": 0.4887097403407097, + "step": 350, + "token_counts/after_target": 409.75, + "token_counts/after_think": 11.5, + "token_counts/before_target": 1983.75, + "token_counts/before_think": 548.25 + }, + { + "avg_penalty/after_target": 2.935806930065155, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.40082770586013794, + "avg_penalty/before_think": 0.3654246926307678, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 631.25, + "completions/max_terminated_length": 571.5, + "completions/mean_length": 209.109375, + "completions/mean_terminated_length": 196.89791870117188, + "completions/min_length": 79.25, + "completions/min_terminated_length": 79.25, + "epoch": 0.1755, + "grad_norm": 21.01880645751953, + "kl": 21.46875, + "learning_rate": 1.9659258262890683e-05, + "loss": 2.1071, + "num_tokens": 14522376.0, + "reward": 1.34765625, + "reward_std": 0.8852957785129547, + "rewards/accuracy_reward/mean": null, + "rewards/accuracy_reward/std": null, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4797805994749069, + "rewards/tag_count_reward/mean": 0.69140625, + "rewards/tag_count_reward/std": 0.4453607350587845, + "step": 351, + "token_counts/after_target": 743.75, + "token_counts/after_think": 28.75, + "token_counts/before_target": 1984.0, + "token_counts/before_think": 589.25 + }, + { + "avg_penalty/after_target": 2.600678563117981, + "avg_penalty/after_think": 3.986051559448242, + "avg_penalty/before_target": 0.3053203225135803, + "avg_penalty/before_think": 0.3654201067984104, + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 758.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 193.890625, + "completions/mean_terminated_length": 166.79479598999023, + "completions/min_length": 72.75, + "completions/min_terminated_length": 72.75, + "epoch": 0.176, + "grad_norm": 25.477439880371094, + "kl": 10.9609375, + "learning_rate": 1.9654726308792252e-05, + "loss": 1.5546, + "num_tokens": 14546961.0, + "reward": 1.69921875, + "reward_std": 0.6917185336351395, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.3723389655351639, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.3459278345108032, + "step": 352, + "token_counts/after_target": 353.5, + "token_counts/after_think": 111.25, + "token_counts/before_target": 1842.5, + "token_counts/before_think": 795.0 + }, + { + "avg_penalty/after_target": 2.088748812675476, + "avg_penalty/after_think": 2.8241586685180664, + "avg_penalty/before_target": 0.5475023835897446, + "avg_penalty/before_think": 0.480647012591362, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 770.25, + "completions/max_terminated_length": 656.75, + "completions/mean_length": 262.0625, + "completions/mean_terminated_length": 209.109375, + "completions/min_length": 64.75, + "completions/min_terminated_length": 64.75, + "epoch": 0.1765, + "grad_norm": 25.645753860473633, + "kl": 17.09375, + "learning_rate": 1.9650164944723116e-05, + "loss": 2.0165, + "num_tokens": 14572997.0, + "reward": 1.390625, + "reward_std": 0.8606233298778534, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4788651168346405, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.4252755269408226, + "step": 353, + "token_counts/after_target": 1062.0, + "token_counts/after_think": 60.0, + "token_counts/before_target": 2342.5, + "token_counts/before_think": 728.5 + }, + { + "avg_penalty/after_target": 2.017300099134445, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.7558761239051819, + "avg_penalty/before_think": 0.4614013321697712, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 950.25, + "completions/max_terminated_length": 888.5, + "completions/mean_length": 352.5, + "completions/mean_terminated_length": 317.91101837158203, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.177, + "grad_norm": 14.793201446533203, + "kl": 30.09375, + "learning_rate": 1.9645574184577982e-05, + "loss": 2.7788, + "num_tokens": 14605973.0, + "reward": 1.17578125, + "reward_std": 0.9622426778078079, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.49808918684720993, + "rewards/tag_count_reward/mean": 0.59765625, + "rewards/tag_count_reward/std": 0.4816601350903511, + "step": 354, + "token_counts/after_target": 2124.25, + "token_counts/after_think": 20.5, + "token_counts/before_target": 2936.25, + "token_counts/before_think": 559.0 + }, + { + "avg_penalty/after_target": 2.5238006711006165, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.7927010357379913, + "avg_penalty/before_think": 0.39170049875974655, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 1002.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 394.953125, + "completions/mean_terminated_length": 233.73317337036133, + "completions/min_length": 62.25, + "completions/min_terminated_length": 62.25, + "epoch": 0.1775, + "grad_norm": 5.108211994171143, + "kl": 37.9375, + "learning_rate": 1.96409540423411e-05, + "loss": 3.1084, + "num_tokens": 14640834.0, + "reward": 0.97265625, + "reward_std": 0.9610662162303925, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.4375, + "rewards/format_reward/std": 0.5059641748666763, + "rewards/tag_count_reward/mean": 0.51953125, + "rewards/tag_count_reward/std": 0.4803251847624779, + "step": 355, + "token_counts/after_target": 2376.75, + "token_counts/after_think": 14.75, + "token_counts/before_target": 3506.5, + "token_counts/before_think": 421.25 + }, + { + "avg_penalty/after_target": 2.945079743862152, + "avg_penalty/after_think": 1.998612403869629, + "avg_penalty/before_target": 0.8996576741337776, + "avg_penalty/before_think": 0.3335805758833885, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 889.75, + "completions/max_terminated_length": 667.5, + "completions/mean_length": 362.046875, + "completions/mean_terminated_length": 221.45880889892578, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.178, + "grad_norm": 8.968656539916992, + "kl": 47.84375, + "learning_rate": 1.963630453208623e-05, + "loss": 3.9308, + "num_tokens": 14675621.0, + "reward": 1.2109375, + "reward_std": 0.9677328169345856, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.5018647313117981, + "rewards/tag_count_reward/mean": 0.6171875, + "rewards/tag_count_reward/std": 0.47989528626203537, + "step": 356, + "token_counts/after_target": 2919.25, + "token_counts/after_think": 12.75, + "token_counts/before_target": 2273.0, + "token_counts/before_think": 587.75 + }, + { + "avg_penalty/after_target": 2.3691776394844055, + "avg_penalty/after_think": 2.823847770690918, + "avg_penalty/before_target": 0.8531104996800423, + "avg_penalty/before_think": 0.3173309192061424, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 332.171875, + "completions/mean_terminated_length": 166.07202911376953, + "completions/min_length": 63.5, + "completions/min_terminated_length": 63.5, + "epoch": 0.1785, + "grad_norm": 11.427400588989258, + "kl": 49.0, + "learning_rate": 1.9631625667976584e-05, + "loss": 3.88, + "num_tokens": 14706032.0, + "reward": 1.34375, + "reward_std": 0.8539705276489258, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.6875, + "rewards/tag_count_reward/std": 0.42698923498392105, + "step": 357, + "token_counts/after_target": 2315.5, + "token_counts/after_think": 25.5, + "token_counts/before_target": 2455.5, + "token_counts/before_think": 518.25 + }, + { + "avg_penalty/after_target": 2.615127444267273, + "avg_penalty/after_think": 3.933969795703888, + "avg_penalty/before_target": 0.726073831319809, + "avg_penalty/before_think": 0.4170229360461235, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, 
+ "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 225.953125, + "completions/mean_terminated_length": 128.44103240966797, + "completions/min_length": 57.5, + "completions/min_terminated_length": 57.5, + "epoch": 0.179, + "grad_norm": 20.981475830078125, + "kl": 27.4375, + "learning_rate": 1.962691746426479e-05, + "loss": 2.8683, + "num_tokens": 14730285.0, + "reward": 1.625, + "reward_std": 0.7714012563228607, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.38336414843797684, + "step": 358, + "token_counts/after_target": 1084.5, + "token_counts/after_think": 53.25, + "token_counts/before_target": 1759.75, + "token_counts/before_think": 717.75 + }, + { + "avg_penalty/after_target": 2.907451868057251, + "avg_penalty/after_think": 2.939721465110779, + "avg_penalty/before_target": 0.31874367594718933, + "avg_penalty/before_think": 0.4965599328279495, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 439.0, + "completions/max_terminated_length": 328.5, + "completions/mean_length": 144.6875, + "completions/mean_terminated_length": 131.24062728881836, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.1795, + "grad_norm": 8.837929725646973, + "kl": 8.798828125, + "learning_rate": 1.9622179935292855e-05, + "loss": 1.0932, + "num_tokens": 14746777.0, + "reward": 1.6640625, + "reward_std": 0.6345293074846268, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + 
"rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.29259341210126877, + "step": 359, + "token_counts/after_target": 353.0, + "token_counts/after_think": 32.75, + "token_counts/before_target": 1153.25, + "token_counts/before_think": 776.0 + }, + { + "avg_penalty/after_target": 2.6160506308078766, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.500193364918232, + "avg_penalty/before_think": 0.4072412773966789, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 711.5, + "completions/max_terminated_length": 516.25, + "completions/mean_length": 201.140625, + "completions/mean_terminated_length": 161.96101760864258, + "completions/min_length": 68.5, + "completions/min_terminated_length": 68.5, + "epoch": 0.18, + "grad_norm": 17.66595458984375, + "kl": 16.96875, + "learning_rate": 1.9617413095492114e-05, + "loss": 1.9405, + "num_tokens": 14768626.0, + "reward": 1.65234375, + "reward_std": 0.678281843662262, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.33054671436548233, + "step": 360, + "token_counts/after_target": 887.0, + "token_counts/after_think": 41.5, + "token_counts/before_target": 1401.5, + "token_counts/before_think": 888.25 + }, + { + "avg_penalty/after_target": 3.0590643286705017, + "avg_penalty/after_think": 1.9922000169754028, + "avg_penalty/before_target": 0.6193084567785263, + "avg_penalty/before_think": 0.380093764513731, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0625, + "completions/max_length": 845.5, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 225.4375, + "completions/mean_terminated_length": 173.58140563964844, + "completions/min_length": 29.75, + "completions/min_terminated_length": 29.75, + "epoch": 0.1805, + "grad_norm": 9.968870162963867, + "kl": 31.375, + "learning_rate": 1.961261695938319e-05, + "loss": 2.6719, + "num_tokens": 14794702.0, + "reward": 1.359375, + "reward_std": 0.8367255926132202, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.46326854079961777, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.4151313230395317, + "step": 361, + "token_counts/after_target": 1186.5, + "token_counts/after_think": 33.75, + "token_counts/before_target": 1731.0, + "token_counts/before_think": 655.75 + }, + { + "avg_penalty/after_target": 1.7128340005874634, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5563286356627941, + "avg_penalty/before_think": 0.40317726880311966, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 855.0, + "completions/max_terminated_length": 552.75, + "completions/mean_length": 239.875, + "completions/mean_terminated_length": 175.22244262695312, + "completions/min_length": 66.5, + "completions/min_terminated_length": 66.5, + "epoch": 0.181, + "grad_norm": 30.85233497619629, + "kl": 38.875, + "learning_rate": 1.9607791541575944e-05, + "loss": 2.6791, + "num_tokens": 14819718.0, + "reward": 1.47265625, + "reward_std": 0.7843271642923355, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.44495995342731476, + 
"rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.37210510671138763, + "step": 362, + "token_counts/after_target": 914.25, + "token_counts/after_think": 39.5, + "token_counts/before_target": 2084.5, + "token_counts/before_think": 799.75 + }, + { + "avg_penalty/after_target": 2.041320115327835, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.47822510451078415, + "avg_penalty/before_think": 0.5391234159469604, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 723.25, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 214.125, + "completions/mean_terminated_length": 173.4035758972168, + "completions/min_length": 16.25, + "completions/min_terminated_length": 16.25, + "epoch": 0.1815, + "grad_norm": 41.891448974609375, + "kl": 44.125, + "learning_rate": 1.9602936856769432e-05, + "loss": 2.741, + "num_tokens": 14843550.0, + "reward": 1.265625, + "reward_std": 0.9039315432310104, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.5071863383054733, + "rewards/tag_count_reward/mean": 0.6875, + "rewards/tag_count_reward/std": 0.44149748980998993, + "step": 363, + "token_counts/after_target": 687.5, + "token_counts/after_think": 29.0, + "token_counts/before_target": 1954.25, + "token_counts/before_think": 755.25 + }, + { + "avg_penalty/after_target": 2.4173173308372498, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.38602636009454727, + "avg_penalty/before_think": 0.40666378289461136, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 468.25, + 
"completions/max_terminated_length": 382.0, + "completions/mean_length": 172.0, + "completions/mean_terminated_length": 159.18542098999023, + "completions/min_length": 48.25, + "completions/min_terminated_length": 48.25, + "epoch": 0.182, + "grad_norm": 15.0731201171875, + "kl": 20.3203125, + "learning_rate": 1.959805291975187e-05, + "loss": 1.3762, + "num_tokens": 14863278.0, + "reward": 1.640625, + "reward_std": 0.7215436547994614, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4022643193602562, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.33657532930374146, + "step": 364, + "token_counts/after_target": 375.0, + "token_counts/after_think": 35.25, + "token_counts/before_target": 1351.0, + "token_counts/before_think": 990.75 + }, + { + "avg_penalty/after_target": 2.095641613006592, + "avg_penalty/after_think": 3.9293015599250793, + "avg_penalty/before_target": 0.48077940940856934, + "avg_penalty/before_think": 0.5397518202662468, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 815.25, + "completions/max_terminated_length": 610.0, + "completions/mean_length": 208.359375, + "completions/mean_terminated_length": 182.0885467529297, + "completions/min_length": 61.75, + "completions/min_terminated_length": 61.75, + "epoch": 0.1825, + "grad_norm": 8.624796867370605, + "kl": 28.21875, + "learning_rate": 1.9593139745400575e-05, + "loss": 2.223, + "num_tokens": 14887205.0, + "reward": 1.51953125, + "reward_std": 0.7891295850276947, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.46034691482782364, + "rewards/tag_count_reward/mean": 0.81640625, + 
"rewards/tag_count_reward/std": 0.36837751418352127, + "step": 365, + "token_counts/after_target": 659.0, + "token_counts/after_think": 68.5, + "token_counts/before_target": 1783.5, + "token_counts/before_think": 822.75 + }, + { + "avg_penalty/after_target": 2.6792759001255035, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.423007570207119, + "avg_penalty/before_think": 0.331515334546566, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 611.5, + "completions/max_terminated_length": 396.25, + "completions/mean_length": 160.734375, + "completions/mean_terminated_length": 146.55000114440918, + "completions/min_length": 72.25, + "completions/min_terminated_length": 72.25, + "epoch": 0.183, + "grad_norm": 25.106908798217773, + "kl": 9.62109375, + "learning_rate": 1.958819734868193e-05, + "loss": 1.402, + "num_tokens": 14909252.0, + "reward": 1.84375, + "reward_std": 0.4296482056379318, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2561737820506096, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.17756596952676773, + "step": 366, + "token_counts/after_target": 395.75, + "token_counts/after_think": 42.25, + "token_counts/before_target": 1211.0, + "token_counts/before_think": 922.75 + }, + { + "avg_penalty/after_target": 3.2928434014320374, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.20731591433286667, + "avg_penalty/before_think": 0.3766041286289692, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + 
"completions/mean_length": 148.03125, + "completions/mean_terminated_length": 148.03125, + "completions/min_length": 74.5, + "completions/min_terminated_length": 74.5, + "epoch": 0.1835, + "grad_norm": 16.15721893310547, + "kl": 7.640625, + "learning_rate": 1.9583225744651334e-05, + "loss": 0.9991, + "num_tokens": 14928806.0, + "reward": 1.7734375, + "reward_std": 0.5825138092041016, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3265564441680908, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.25785715878009796, + "step": 367, + "token_counts/after_target": 247.0, + "token_counts/after_think": 35.25, + "token_counts/before_target": 1175.75, + "token_counts/before_think": 910.5 + }, + { + "avg_penalty/after_target": 2.304462730884552, + "avg_penalty/after_think": 3.7434969544410706, + "avg_penalty/before_target": 0.32407769560813904, + "avg_penalty/before_think": 0.3702738806605339, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 146.828125, + "completions/mean_terminated_length": 146.828125, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.184, + "grad_norm": 11.783086776733398, + "kl": 6.73828125, + "learning_rate": 1.957822494845315e-05, + "loss": 0.8988, + "num_tokens": 14946667.0, + "reward": 1.70703125, + "reward_std": 0.6080164611339569, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.39476002007722855, + "rewards/tag_count_reward/mean": 0.91015625, + "rewards/tag_count_reward/std": 0.2630172595381737, + "step": 368, + 
"token_counts/after_target": 305.5, + "token_counts/after_think": 88.75, + "token_counts/before_target": 1120.0, + "token_counts/before_think": 835.0 + }, + { + "avg_penalty/after_target": 2.4367454946041107, + "avg_penalty/after_think": 3.8135042786598206, + "avg_penalty/before_target": 0.3079126738011837, + "avg_penalty/before_think": 0.4798278212547302, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.25, + "completions/max_terminated_length": 339.25, + "completions/mean_length": 155.6875, + "completions/mean_terminated_length": 155.6875, + "completions/min_length": 55.25, + "completions/min_terminated_length": 55.25, + "epoch": 0.1845, + "grad_norm": 3.608577251434326, + "kl": 7.2470703125, + "learning_rate": 1.9573194975320672e-05, + "loss": 0.803, + "num_tokens": 14966631.0, + "reward": 1.703125, + "reward_std": 0.5801121741533279, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.3890564441680908, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.22715851664543152, + "step": 369, + "token_counts/after_target": 203.5, + "token_counts/after_think": 75.0, + "token_counts/before_target": 1252.5, + "token_counts/before_think": 960.0 + }, + { + "avg_penalty/after_target": 2.874207943677902, + "avg_penalty/after_think": 2.7492939233779907, + "avg_penalty/before_target": 0.32986772805452347, + "avg_penalty/before_think": 0.32238437980413437, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 216.125, + 
"completions/mean_terminated_length": 216.125, + "completions/min_length": 56.75, + "completions/min_terminated_length": 56.75, + "epoch": 0.185, + "grad_norm": 28.181602478027344, + "kl": 35.84375, + "learning_rate": 1.956813584057608e-05, + "loss": 2.2775, + "num_tokens": 14991167.0, + "reward": 0.87890625, + "reward_std": 0.7161921858787537, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.265625, + "rewards/format_reward/std": 0.44187305867671967, + "rewards/tag_count_reward/mean": 0.61328125, + "rewards/tag_count_reward/std": 0.3333843946456909, + "step": 370, + "token_counts/after_target": 623.0, + "token_counts/after_think": 55.25, + "token_counts/before_target": 2243.25, + "token_counts/before_think": 536.5 + }, + { + "avg_penalty/after_target": 2.2812762558460236, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.48152797669172287, + "avg_penalty/before_think": 0.6379273235797882, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 738.75, + "completions/max_terminated_length": 581.75, + "completions/mean_length": 278.90625, + "completions/mean_terminated_length": 240.6742820739746, + "completions/min_length": 68.5, + "completions/min_terminated_length": 68.5, + "epoch": 0.1855, + "grad_norm": 40.608150482177734, + "kl": 45.375, + "learning_rate": 1.9563047559630356e-05, + "loss": 2.8257, + "num_tokens": 15023977.0, + "reward": 0.66015625, + "reward_std": 0.6190028488636017, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.3723389655351639, + "rewards/tag_count_reward/mean": 0.50390625, + "rewards/tag_count_reward/std": 0.32469891756772995, + "step": 371, + "token_counts/after_target": 1249.25, + 
"token_counts/after_think": 45.75, + "token_counts/before_target": 2452.0, + "token_counts/before_think": 715.5 + }, + { + "avg_penalty/after_target": 2.384190618991852, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.48552603274583817, + "avg_penalty/before_think": 0.6160333901643753, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1012.75, + "completions/max_terminated_length": 804.25, + "completions/mean_length": 336.96875, + "completions/mean_terminated_length": 291.793758392334, + "completions/min_length": 44.25, + "completions/min_terminated_length": 44.25, + "epoch": 0.186, + "grad_norm": 34.79073715209961, + "kl": 40.875, + "learning_rate": 1.9557930147983303e-05, + "loss": 2.6089, + "num_tokens": 15055287.0, + "reward": 0.56640625, + "reward_std": 0.6083162724971771, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.33406074345111847, + "rewards/tag_count_reward/mean": 0.44140625, + "rewards/tag_count_reward/std": 0.3557855784893036, + "step": 372, + "token_counts/after_target": 1495.5, + "token_counts/after_think": 0.0, + "token_counts/before_target": 3530.25, + "token_counts/before_think": 365.75 + }, + { + "avg_penalty/after_target": 2.2392342388629913, + "avg_penalty/after_think": 2.781264007091522, + "avg_penalty/before_target": 0.44986751675605774, + "avg_penalty/before_think": 0.6810977384448051, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 889.75, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 318.859375, + "completions/mean_terminated_length": 297.07813262939453, 
+ "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.1865, + "grad_norm": 15.154046058654785, + "kl": 27.40625, + "learning_rate": 1.9552783621223437e-05, + "loss": 1.9989, + "num_tokens": 15084254.0, + "reward": 0.6328125, + "reward_std": 0.6311144977807999, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.36797718703746796, + "rewards/tag_count_reward/mean": 0.4765625, + "rewards/tag_count_reward/std": 0.3352278992533684, + "step": 373, + "token_counts/after_target": 1462.75, + "token_counts/after_think": 41.0, + "token_counts/before_target": 2966.75, + "token_counts/before_think": 631.25 + }, + { + "avg_penalty/after_target": 2.9700928330421448, + "avg_penalty/after_think": 3.263495773077011, + "avg_penalty/before_target": 0.4396216943860054, + "avg_penalty/before_think": 0.5922013521194458, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 814.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 284.765625, + "completions/mean_terminated_length": 284.765625, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.187, + "grad_norm": 14.023021697998047, + "kl": 15.984375, + "learning_rate": 1.954760799502798e-05, + "loss": 1.7376, + "num_tokens": 15114383.0, + "reward": 0.91796875, + "reward_std": 0.7876884341239929, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.328125, + "rewards/format_reward/std": 0.479247085750103, + "rewards/tag_count_reward/mean": 0.58984375, + "rewards/tag_count_reward/std": 0.3677973598241806, + "step": 374, + "token_counts/after_target": 1182.5, + "token_counts/after_think": 163.25, + 
"token_counts/before_target": 2400.5, + "token_counts/before_think": 810.0 + }, + { + "avg_penalty/after_target": 2.7585986852645874, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.46967898309230804, + "avg_penalty/before_think": 0.35476409643888474, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 623.75, + "completions/max_terminated_length": 583.25, + "completions/mean_length": 252.21875, + "completions/mean_terminated_length": 240.94479370117188, + "completions/min_length": 87.25, + "completions/min_terminated_length": 87.25, + "epoch": 0.1875, + "grad_norm": 15.294742584228516, + "kl": 11.8984375, + "learning_rate": 1.954240328516277e-05, + "loss": 1.4506, + "num_tokens": 15140877.0, + "reward": 1.0078125, + "reward_std": 0.7812246531248093, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.4697679653763771, + "rewards/tag_count_reward/mean": 0.6328125, + "rewards/tag_count_reward/std": 0.38155290484428406, + "step": 375, + "token_counts/after_target": 1018.0, + "token_counts/after_think": 100.25, + "token_counts/before_target": 2201.75, + "token_counts/before_think": 715.5 + }, + { + "avg_penalty/after_target": 2.6231605410575867, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4521188698709011, + "avg_penalty/before_think": 0.7772026136517525, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.75, + "completions/max_terminated_length": 643.75, + "completions/mean_length": 277.171875, + "completions/mean_terminated_length": 277.171875, + "completions/min_length": 54.0, + 
"completions/min_terminated_length": 54.0, + "epoch": 0.188, + "grad_norm": 18.12105369567871, + "kl": 11.2109375, + "learning_rate": 1.953716950748227e-05, + "loss": 1.4909, + "num_tokens": 15172936.0, + "reward": 1.0625, + "reward_std": 0.8642528653144836, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.49244368076324463, + "rewards/tag_count_reward/mean": 0.609375, + "rewards/tag_count_reward/std": 0.41865578293800354, + "step": 376, + "token_counts/after_target": 1135.25, + "token_counts/after_think": 139.75, + "token_counts/before_target": 2388.5, + "token_counts/before_think": 771.25 + }, + { + "avg_penalty/after_target": 2.193263053894043, + "avg_penalty/after_think": 1.7622202634811401, + "avg_penalty/before_target": 0.47231484204530716, + "avg_penalty/before_think": 0.39263857901096344, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.25, + "completions/max_terminated_length": 595.25, + "completions/mean_length": 264.328125, + "completions/mean_terminated_length": 264.328125, + "completions/min_length": 99.25, + "completions/min_terminated_length": 99.25, + "epoch": 0.1885, + "grad_norm": 13.807279586791992, + "kl": 10.5703125, + "learning_rate": 1.9531906677929472e-05, + "loss": 1.2466, + "num_tokens": 15198813.0, + "reward": 1.14453125, + "reward_std": 0.8990539759397507, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.48456869274377823, + "rewards/tag_count_reward/mean": 0.62890625, + "rewards/tag_count_reward/std": 0.42949824780225754, + "step": 377, + "token_counts/after_target": 954.5, + "token_counts/after_think": 31.25, + "token_counts/before_target": 2286.75, + 
"token_counts/before_think": 956.75 + }, + { + "avg_penalty/after_target": 2.62151563167572, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3711348921060562, + "avg_penalty/before_think": 0.4059109538793564, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 669.0, + "completions/max_terminated_length": 669.0, + "completions/mean_length": 257.578125, + "completions/mean_terminated_length": 257.578125, + "completions/min_length": 76.5, + "completions/min_terminated_length": 76.5, + "epoch": 0.189, + "grad_norm": 5.446249008178711, + "kl": 17.09375, + "learning_rate": 1.9526614812535866e-05, + "loss": 1.5748, + "num_tokens": 15225474.0, + "reward": 1.0234375, + "reward_std": 0.8326354026794434, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.40625, + "rewards/format_reward/std": 0.48456869274377823, + "rewards/tag_count_reward/mean": 0.6171875, + "rewards/tag_count_reward/std": 0.4096052795648575, + "step": 378, + "token_counts/after_target": 632.25, + "token_counts/after_think": 31.0, + "token_counts/before_target": 2765.5, + "token_counts/before_think": 692.5 + }, + { + "avg_penalty/after_target": 2.504075735807419, + "avg_penalty/after_think": 3.4490741789340973, + "avg_penalty/before_target": 0.29613249376416206, + "avg_penalty/before_think": 0.5187743753194809, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.75, + "completions/max_terminated_length": 570.75, + "completions/mean_length": 240.796875, + "completions/mean_terminated_length": 240.796875, + "completions/min_length": 94.5, + "completions/min_terminated_length": 94.5, + "epoch": 0.1895, + 
"grad_norm": 6.492173671722412, + "kl": 20.65625, + "learning_rate": 1.9521293927421388e-05, + "loss": 1.5787, + "num_tokens": 15251253.0, + "reward": 1.0625, + "reward_std": 0.9017345458269119, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.46875, + "rewards/format_reward/std": 0.5018647313117981, + "rewards/tag_count_reward/mean": 0.59375, + "rewards/tag_count_reward/std": 0.4480937793850899, + "step": 379, + "token_counts/after_target": 583.25, + "token_counts/after_think": 115.75, + "token_counts/before_target": 2266.0, + "token_counts/before_think": 887.75 + }, + { + "avg_penalty/after_target": 3.1816319823265076, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.28982945904135704, + "avg_penalty/before_think": 0.4344577118754387, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.25, + "completions/max_terminated_length": 455.25, + "completions/mean_length": 242.203125, + "completions/mean_terminated_length": 242.203125, + "completions/min_length": 92.5, + "completions/min_terminated_length": 92.5, + "epoch": 0.19, + "grad_norm": 13.89133071899414, + "kl": 24.703125, + "learning_rate": 1.9515944038794384e-05, + "loss": 1.6874, + "num_tokens": 15278210.0, + "reward": 1.0625, + "reward_std": 0.9116902351379395, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.46875, + "rewards/format_reward/std": 0.5061737895011902, + "rewards/tag_count_reward/mean": 0.59375, + "rewards/tag_count_reward/std": 0.4373352453112602, + "step": 380, + "token_counts/after_target": 606.75, + "token_counts/after_think": 69.0, + "token_counts/before_target": 2280.25, + "token_counts/before_think": 919.25 + }, + { + "avg_penalty/after_target": 2.6917665004730225, + 
"avg_penalty/after_think": 2.9862343668937683, + "avg_penalty/before_target": 0.39844587445259094, + "avg_penalty/before_think": 0.5186319947242737, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.25, + "completions/max_terminated_length": 430.25, + "completions/mean_length": 224.15625, + "completions/mean_terminated_length": 224.15625, + "completions/min_length": 83.75, + "completions/min_terminated_length": 83.75, + "epoch": 0.1905, + "grad_norm": 14.49797248840332, + "kl": 28.0625, + "learning_rate": 1.9510565162951538e-05, + "loss": 1.955, + "num_tokens": 15300428.0, + "reward": 1.08984375, + "reward_std": 0.9395835548639297, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.484375, + "rewards/format_reward/std": 0.5112857818603516, + "rewards/tag_count_reward/mean": 0.60546875, + "rewards/tag_count_reward/std": 0.46651536226272583, + "step": 381, + "token_counts/after_target": 635.0, + "token_counts/after_think": 91.5, + "token_counts/before_target": 1960.5, + "token_counts/before_think": 899.5 + }, + { + "avg_penalty/after_target": 3.248553454875946, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.3686991333961487, + "avg_penalty/before_think": 0.42502912878990173, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.75, + "completions/max_terminated_length": 508.75, + "completions/mean_length": 248.265625, + "completions/mean_terminated_length": 248.265625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.191, + "grad_norm": 5.329586982727051, + "kl": 24.75, + "learning_rate": 1.950515731627784e-05, + 
"loss": 1.7696, + "num_tokens": 15326029.0, + "reward": 1.046875, + "reward_std": 0.9328960627317429, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.484375, + "rewards/format_reward/std": 0.4896806851029396, + "rewards/tag_count_reward/mean": 0.5625, + "rewards/tag_count_reward/std": 0.4741537347435951, + "step": 382, + "token_counts/after_target": 802.75, + "token_counts/after_think": 12.5, + "token_counts/before_target": 2110.75, + "token_counts/before_think": 1046.25 + }, + { + "avg_penalty/after_target": 2.5857649445533752, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3819134831428528, + "avg_penalty/before_think": 0.4075982943177223, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.75, + "completions/max_terminated_length": 490.75, + "completions/mean_length": 265.015625, + "completions/mean_terminated_length": 265.015625, + "completions/min_length": 111.25, + "completions/min_terminated_length": 111.25, + "epoch": 0.1915, + "grad_norm": 5.268880844116211, + "kl": 18.734375, + "learning_rate": 1.9499720515246524e-05, + "loss": 1.489, + "num_tokens": 15351086.0, + "reward": 1.28515625, + "reward_std": 0.917900025844574, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.498777836561203, + "rewards/tag_count_reward/mean": 0.67578125, + "rewards/tag_count_reward/std": 0.4373384267091751, + "step": 383, + "token_counts/after_target": 822.25, + "token_counts/after_think": 110.0, + "token_counts/before_target": 2291.0, + "token_counts/before_think": 1017.0 + }, + { + "avg_penalty/after_target": 2.939229369163513, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.30905338004231453, + 
"avg_penalty/before_think": 0.4027370400726795, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.5, + "completions/max_terminated_length": 463.5, + "completions/mean_length": 259.015625, + "completions/mean_terminated_length": 259.015625, + "completions/min_length": 138.75, + "completions/min_terminated_length": 138.75, + "epoch": 0.192, + "grad_norm": 25.607656478881836, + "kl": 23.4375, + "learning_rate": 1.949425477641904e-05, + "loss": 1.4742, + "num_tokens": 15377343.0, + "reward": 0.64453125, + "reward_std": 0.5882245451211929, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.140625, + "rewards/format_reward/std": 0.3375816270709038, + "rewards/tag_count_reward/mean": 0.50390625, + "rewards/tag_count_reward/std": 0.3120284602046013, + "step": 384, + "token_counts/after_target": 610.75, + "token_counts/after_think": 120.25, + "token_counts/before_target": 2359.5, + "token_counts/before_think": 1053.75 + }, + { + "avg_penalty/after_target": 2.430389255285263, + "avg_penalty/after_think": 2.9132847785949707, + "avg_penalty/before_target": 0.3004634901881218, + "avg_penalty/before_think": 0.5570531338453293, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.75, + "completions/max_terminated_length": 494.75, + "completions/mean_length": 243.96875, + "completions/mean_terminated_length": 243.96875, + "completions/min_length": 62.75, + "completions/min_terminated_length": 62.75, + "epoch": 0.1925, + "grad_norm": 7.2882490158081055, + "kl": 13.7265625, + "learning_rate": 1.9488760116444966e-05, + "loss": 1.0557, + "num_tokens": 15406973.0, + "reward": 0.796875, + 
"reward_std": 0.767437607049942, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.44495995342731476, + "rewards/tag_count_reward/mean": 0.484375, + "rewards/tag_count_reward/std": 0.3615671545267105, + "step": 385, + "token_counts/after_target": 502.0, + "token_counts/after_think": 134.0, + "token_counts/before_target": 2203.25, + "token_counts/before_think": 1064.25 + }, + { + "avg_penalty/after_target": 2.481335073709488, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3989144191145897, + "avg_penalty/before_think": 0.6331415623426437, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 827.0, + "completions/max_terminated_length": 704.25, + "completions/mean_length": 354.640625, + "completions/mean_terminated_length": 344.01458740234375, + "completions/min_length": 127.75, + "completions/min_terminated_length": 127.75, + "epoch": 0.193, + "grad_norm": 21.397537231445312, + "kl": 8.984375, + "learning_rate": 1.9483236552061996e-05, + "loss": 1.2163, + "num_tokens": 15438022.0, + "reward": 1.28125, + "reward_std": 0.858681783080101, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.48605145514011383, + "rewards/tag_count_reward/mean": 0.703125, + "rewards/tag_count_reward/std": 0.41823451966047287, + "step": 386, + "token_counts/after_target": 1272.25, + "token_counts/after_think": 74.5, + "token_counts/before_target": 2426.25, + "token_counts/before_think": 1901.25 + }, + { + "avg_penalty/after_target": 1.7608994841575623, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5034435242414474, + "avg_penalty/before_think": 0.7900138944387436, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 951.0, + "completions/max_terminated_length": 918.75, + "completions/mean_length": 490.625, + "completions/mean_terminated_length": 475.35731506347656, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.1935, + "grad_norm": 13.078681945800781, + "kl": 3.25146484375, + "learning_rate": 1.947768410009586e-05, + "loss": 0.9537, + "num_tokens": 15477406.0, + "reward": 1.4609375, + "reward_std": 0.7603407353162766, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.45565588772296906, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.3318638876080513, + "step": 387, + "token_counts/after_target": 1601.0, + "token_counts/after_think": 510.0, + "token_counts/before_target": 2108.75, + "token_counts/before_think": 3630.25 + }, + { + "avg_penalty/after_target": 2.719365417957306, + "avg_penalty/after_think": 3.2392625212669373, + "avg_penalty/before_target": 0.550871379673481, + "avg_penalty/before_think": 0.6551290452480316, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 931.75, + "completions/max_terminated_length": 903.0, + "completions/mean_length": 483.609375, + "completions/mean_terminated_length": 453.14576721191406, + "completions/min_length": 154.75, + "completions/min_terminated_length": 154.75, + "epoch": 0.194, + "grad_norm": 24.56783676147461, + "kl": 3.92578125, + "learning_rate": 1.9472102777460292e-05, + "loss": 1.2326, + "num_tokens": 15517093.0, + "reward": 1.421875, + "reward_std": 0.8200534731149673, 
+ "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.48989029973745346, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.3740072622895241, + "step": 388, + "token_counts/after_target": 1846.75, + "token_counts/after_think": 343.5, + "token_counts/before_target": 2096.0, + "token_counts/before_think": 3451.5 + }, + { + "avg_penalty/after_target": 2.202809900045395, + "avg_penalty/after_think": 3.9068379998207092, + "avg_penalty/before_target": 0.4070561081171036, + "avg_penalty/before_think": 0.5845881775021553, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 704.25, + "completions/max_terminated_length": 702.75, + "completions/mean_length": 394.453125, + "completions/mean_terminated_length": 378.27679443359375, + "completions/min_length": 170.75, + "completions/min_terminated_length": 170.75, + "epoch": 0.1945, + "grad_norm": 13.286404609680176, + "kl": 4.43359375, + "learning_rate": 1.9466492601156964e-05, + "loss": 0.8773, + "num_tokens": 15551794.0, + "reward": 1.59375, + "reward_std": 0.7483254820108414, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4260597825050354, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.3611627370119095, + "step": 389, + "token_counts/after_target": 989.0, + "token_counts/after_think": 311.75, + "token_counts/before_target": 2266.5, + "token_counts/before_think": 2744.0 + }, + { + "avg_penalty/after_target": 2.3823397159576416, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.43545298278331757, + "avg_penalty/before_think": 0.6981411576271057, + "clip_ratio/high_max": 
0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 769.5, + "completions/max_terminated_length": 769.0, + "completions/mean_length": 452.984375, + "completions/mean_terminated_length": 432.99400329589844, + "completions/min_length": 184.25, + "completions/min_terminated_length": 184.25, + "epoch": 0.195, + "grad_norm": 10.465078353881836, + "kl": 7.0703125, + "learning_rate": 1.9460853588275454e-05, + "loss": 1.0032, + "num_tokens": 15591713.0, + "reward": 1.29296875, + "reward_std": 0.8501919507980347, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.48866813629865646, + "rewards/tag_count_reward/mean": 0.69921875, + "rewards/tag_count_reward/std": 0.39495618641376495, + "step": 390, + "token_counts/after_target": 1540.75, + "token_counts/after_think": 226.75, + "token_counts/before_target": 2340.75, + "token_counts/before_think": 3139.5 + }, + { + "avg_penalty/after_target": 2.1648833751678467, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4769439995288849, + "avg_penalty/before_think": 0.5671475753188133, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 385.671875, + "completions/mean_terminated_length": 385.671875, + "completions/min_length": 139.5, + "completions/min_terminated_length": 139.5, + "epoch": 0.1955, + "grad_norm": 13.041102409362793, + "kl": 11.3203125, + "learning_rate": 1.945518575599317e-05, + "loss": 1.2524, + "num_tokens": 15626764.0, + "reward": 1.29296875, + "reward_std": 0.8487758040428162, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.4665650501847267, + "rewards/tag_count_reward/mean": 0.69921875, + "rewards/tag_count_reward/std": 0.41439494490623474, + "step": 391, + "token_counts/after_target": 1349.25, + "token_counts/after_think": 248.25, + "token_counts/before_target": 2188.0, + "token_counts/before_think": 2385.25 + }, + { + "avg_penalty/after_target": 2.0659850537776947, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.6327366232872009, + "avg_penalty/before_think": 0.6037076562643051, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 933.25, + "completions/max_terminated_length": 879.25, + "completions/mean_length": 421.140625, + "completions/mean_terminated_length": 402.5291748046875, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.196, + "grad_norm": 13.989801406860352, + "kl": 25.625, + "learning_rate": 1.944948912157531e-05, + "loss": 2.0511, + "num_tokens": 15663813.0, + "reward": 0.984375, + "reward_std": 0.8508637100458145, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.49776528775691986, + "rewards/tag_count_reward/mean": 0.609375, + "rewards/tag_count_reward/std": 0.40660834312438965, + "step": 392, + "token_counts/after_target": 2010.5, + "token_counts/after_think": 186.0, + "token_counts/before_target": 2875.5, + "token_counts/before_think": 1666.25 + }, + { + "avg_penalty/after_target": 2.8351784348487854, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.41368740797042847, + "avg_penalty/before_think": 0.6513979285955429, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 787.25, + "completions/max_terminated_length": 733.5, + "completions/mean_length": 349.125, + "completions/mean_terminated_length": 328.44419860839844, + "completions/min_length": 73.75, + "completions/min_terminated_length": 73.75, + "epoch": 0.1965, + "grad_norm": 67.65618896484375, + "kl": 53.0625, + "learning_rate": 1.944376370237481e-05, + "loss": 2.9231, + "num_tokens": 15695405.0, + "reward": 0.6484375, + "reward_std": 0.55937834456563, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.171875, + "rewards/format_reward/std": 0.2913651168346405, + "rewards/tag_count_reward/mean": 0.4765625, + "rewards/tag_count_reward/std": 0.3204677551984787, + "step": 393, + "token_counts/after_target": 1196.5, + "token_counts/after_think": 80.0, + "token_counts/before_target": 2831.0, + "token_counts/before_think": 1478.5 + }, + { + "avg_penalty/after_target": 2.5845134258270264, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.5907747745513916, + "avg_penalty/before_think": 1.0065215453505516, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 805.25, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 402.09375, + "completions/mean_terminated_length": 338.02679443359375, + "completions/min_length": 35.75, + "completions/min_terminated_length": 35.75, + "epoch": 0.197, + "grad_norm": 90.8973159790039, + "kl": 69.75, + "learning_rate": 1.9438009515832298e-05, + "loss": 3.6332, + "num_tokens": 15736563.0, + "reward": 0.26953125, + "reward_std": 0.41582244262099266, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.046875, 
+ "rewards/format_reward/std": 0.1875, + "rewards/tag_count_reward/mean": 0.22265625, + "rewards/tag_count_reward/std": 0.2704908587038517, + "step": 394, + "token_counts/after_target": 2384.25, + "token_counts/after_think": 0.0, + "token_counts/before_target": 3245.5, + "token_counts/before_think": 803.75 + }, + { + "avg_penalty/after_target": 2.7188538908958435, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.6446268707513809, + "avg_penalty/before_think": 0.41607170179486275, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 836.25, + "completions/mean_length": 424.921875, + "completions/mean_terminated_length": 374.039306640625, + "completions/min_length": 61.25, + "completions/min_terminated_length": 61.25, + "epoch": 0.1975, + "grad_norm": 83.91600799560547, + "kl": 70.625, + "learning_rate": 1.943222657947601e-05, + "loss": 4.0098, + "num_tokens": 15775438.0, + "reward": 0.2890625, + "reward_std": 0.2806224003434181, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.2734375, + "rewards/tag_count_reward/std": 0.2413942888379097, + "step": 395, + "token_counts/after_target": 2617.25, + "token_counts/after_think": 151.25, + "token_counts/before_target": 3460.0, + "token_counts/before_think": 570.25 + }, + { + "avg_penalty/after_target": 1.6794028580188751, + "avg_penalty/after_think": 2.8157498240470886, + "avg_penalty/before_target": 0.5065840035676956, + "avg_penalty/before_think": 1.2222695350646973, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 
0.015625, + "completions/max_length": 901.0, + "completions/max_terminated_length": 857.25, + "completions/mean_length": 387.75, + "completions/mean_terminated_length": 376.6031265258789, + "completions/min_length": 32.75, + "completions/min_terminated_length": 32.75, + "epoch": 0.198, + "grad_norm": 91.15310668945312, + "kl": 66.25, + "learning_rate": 1.9426414910921785e-05, + "loss": 3.3235, + "num_tokens": 15812382.0, + "reward": 0.30078125, + "reward_std": 0.3591059669852257, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.26953125, + "rewards/tag_count_reward/std": 0.2723317816853523, + "step": 396, + "token_counts/after_target": 1797.5, + "token_counts/after_think": 87.0, + "token_counts/before_target": 3571.75, + "token_counts/before_think": 747.75 + }, + { + "avg_penalty/after_target": 2.190975606441498, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.4441869184374809, + "avg_penalty/before_think": 0.757946290075779, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 284.140625, + "completions/mean_terminated_length": 284.140625, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.1985, + "grad_norm": 80.2612533569336, + "kl": 58.25, + "learning_rate": 1.942057452787297e-05, + "loss": 2.8317, + "num_tokens": 15841463.0, + "reward": 0.390625, + "reward_std": 0.31479593366384506, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.375, + "rewards/tag_count_reward/std": 
0.27809376269578934, + "step": 397, + "token_counts/after_target": 1160.25, + "token_counts/after_think": 40.5, + "token_counts/before_target": 2740.5, + "token_counts/before_think": 605.0 + }, + { + "avg_penalty/after_target": 2.950926899909973, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.2997967004776001, + "avg_penalty/before_think": 0.44991714507341385, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.75, + "completions/max_terminated_length": 518.75, + "completions/mean_length": 223.890625, + "completions/mean_terminated_length": 223.890625, + "completions/min_length": 36.25, + "completions/min_terminated_length": 36.25, + "epoch": 0.199, + "grad_norm": 58.85841751098633, + "kl": 41.75, + "learning_rate": 1.941470544812038e-05, + "loss": 2.0848, + "num_tokens": 15866176.0, + "reward": 0.38671875, + "reward_std": 0.31911618262529373, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.37109375, + "rewards/tag_count_reward/std": 0.27642056718468666, + "step": 398, + "token_counts/after_target": 538.0, + "token_counts/after_think": 29.0, + "token_counts/before_target": 2424.75, + "token_counts/before_think": 590.5 + }, + { + "avg_penalty/after_target": 2.5164762437343597, + "avg_penalty/after_think": 2.9790832102298737, + "avg_penalty/before_target": 0.31539979204535484, + "avg_penalty/before_think": 0.6446729749441147, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 787.5, + "completions/max_terminated_length": 684.75, + "completions/mean_length": 255.953125, 
+ "completions/mean_terminated_length": 242.90104293823242, + "completions/min_length": 21.75, + "completions/min_terminated_length": 21.75, + "epoch": 0.1995, + "grad_norm": 18.959447860717773, + "kl": 32.0, + "learning_rate": 1.9408807689542257e-05, + "loss": 2.1173, + "num_tokens": 15891645.0, + "reward": 0.5234375, + "reward_std": 0.46494656428694725, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.2257782220840454, + "rewards/tag_count_reward/mean": 0.4453125, + "rewards/tag_count_reward/std": 0.3121427148580551, + "step": 399, + "token_counts/after_target": 682.75, + "token_counts/after_think": 178.25, + "token_counts/before_target": 2390.0, + "token_counts/before_think": 844.25 + }, + { + "avg_penalty/after_target": 3.139439642429352, + "avg_penalty/after_think": 3.6754931807518005, + "avg_penalty/before_target": 0.21484173089265823, + "avg_penalty/before_think": 0.4552314728498459, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.5, + "completions/max_terminated_length": 376.5, + "completions/mean_length": 168.515625, + "completions/mean_terminated_length": 168.515625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2, + "grad_norm": 11.955544471740723, + "kl": 16.875, + "learning_rate": 1.940288127010419e-05, + "loss": 1.1445, + "num_tokens": 15911374.0, + "reward": 0.81640625, + "reward_std": 0.6620963215827942, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.21875, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.59765625, + "rewards/tag_count_reward/std": 0.32322969287633896, + "step": 400, + "token_counts/after_target": 379.75, + 
"token_counts/after_think": 61.75, + "token_counts/before_target": 1608.75, + "token_counts/before_think": 646.0 + }, + { + "avg_penalty/after_target": 2.8325560092926025, + "avg_penalty/after_think": 3.9573800563812256, + "avg_penalty/before_target": 0.28259362652897835, + "avg_penalty/before_think": 0.3986247330904007, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.75, + "completions/max_terminated_length": 386.75, + "completions/mean_length": 180.6875, + "completions/mean_terminated_length": 180.6875, + "completions/min_length": 47.25, + "completions/min_terminated_length": 47.25, + "epoch": 0.2005, + "grad_norm": 11.698884010314941, + "kl": 6.6953125, + "learning_rate": 1.9396926207859085e-05, + "loss": 0.8684, + "num_tokens": 15936298.0, + "reward": 1.3828125, + "reward_std": 0.778541699051857, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.5040994435548782, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.31058450043201447, + "step": 401, + "token_counts/after_target": 281.0, + "token_counts/after_think": 239.75, + "token_counts/before_target": 1142.5, + "token_counts/before_think": 1227.75 + }, + { + "avg_penalty/after_target": 2.9773448705673218, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.26607896760106087, + "avg_penalty/before_think": 0.3887578323483467, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 179.875, + "completions/mean_terminated_length": 179.875, + 
"completions/min_length": 64.75, + "completions/min_terminated_length": 64.75, + "epoch": 0.201, + "grad_norm": 12.455462455749512, + "kl": 3.9140625, + "learning_rate": 1.939094252094709e-05, + "loss": 0.6907, + "num_tokens": 15959282.0, + "reward": 1.53515625, + "reward_std": 0.7307926267385483, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.479247085750103, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.26355382427573204, + "step": 402, + "token_counts/after_target": 368.75, + "token_counts/after_think": 62.0, + "token_counts/before_target": 1279.5, + "token_counts/before_think": 1167.75 + }, + { + "avg_penalty/after_target": 2.107849419116974, + "avg_penalty/after_think": 3.899596393108368, + "avg_penalty/before_target": 0.3538198694586754, + "avg_penalty/before_think": 0.5702679790556431, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.5, + "completions/max_terminated_length": 540.5, + "completions/mean_length": 230.34375, + "completions/mean_terminated_length": 230.34375, + "completions/min_length": 103.5, + "completions/min_terminated_length": 103.5, + "epoch": 0.2015, + "grad_norm": 21.03863525390625, + "kl": 2.75, + "learning_rate": 1.938493022759556e-05, + "loss": 0.8763, + "num_tokens": 15984024.0, + "reward": 1.58984375, + "reward_std": 0.7106468230485916, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4682852029800415, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.24495066329836845, + "step": 403, + "token_counts/after_target": 622.25, + "token_counts/after_think": 150.5, + 
"token_counts/before_target": 1469.75, + "token_counts/before_think": 1443.0 + }, + { + "avg_penalty/after_target": 3.001355230808258, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.2670205868780613, + "avg_penalty/before_think": 0.5154132768511772, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 235.65625, + "completions/mean_terminated_length": 235.65625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.202, + "grad_norm": 19.558563232421875, + "kl": 2.09375, + "learning_rate": 1.937888934611898e-05, + "loss": 0.8763, + "num_tokens": 16012370.0, + "reward": 1.66015625, + "reward_std": 0.6760948151350021, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.42080147564411163, + "rewards/tag_count_reward/mean": 0.87890625, + "rewards/tag_count_reward/std": 0.2817244939506054, + "step": 404, + "token_counts/after_target": 560.25, + "token_counts/after_think": 220.25, + "token_counts/before_target": 1460.5, + "token_counts/before_think": 1529.5 + }, + { + "avg_penalty/after_target": 2.523833751678467, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.2625508978962898, + "avg_penalty/before_think": 0.4902992621064186, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.75, + "completions/max_terminated_length": 467.75, + "completions/mean_length": 228.578125, + "completions/mean_terminated_length": 228.578125, + "completions/min_length": 105.5, + 
"completions/min_terminated_length": 105.5, + "epoch": 0.2025, + "grad_norm": 18.076353073120117, + "kl": 2.3828125, + "learning_rate": 1.937281989491892e-05, + "loss": 0.8025, + "num_tokens": 16036983.0, + "reward": 1.78125, + "reward_std": 0.4644879847764969, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3758598491549492, + "rewards/tag_count_reward/mean": 0.953125, + "rewards/tag_count_reward/std": 0.11365434341132641, + "step": 405, + "token_counts/after_target": 296.0, + "token_counts/after_think": 215.0, + "token_counts/before_target": 1641.75, + "token_counts/before_think": 1504.5 + }, + { + "avg_penalty/after_target": 2.431214988231659, + "avg_penalty/after_think": 3.0822452306747437, + "avg_penalty/before_target": 0.2802673690021038, + "avg_penalty/before_think": 0.3918134793639183, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.75, + "completions/max_terminated_length": 457.75, + "completions/mean_length": 219.0625, + "completions/mean_terminated_length": 219.0625, + "completions/min_length": 102.25, + "completions/min_terminated_length": 102.25, + "epoch": 0.203, + "grad_norm": 12.364306449890137, + "kl": 3.6953125, + "learning_rate": 1.9366721892483976e-05, + "loss": 0.6726, + "num_tokens": 16061403.0, + "reward": 1.57421875, + "reward_std": 0.6790300011634827, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4713720977306366, + "rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.2554726228117943, + "step": 406, + "token_counts/after_target": 295.0, + "token_counts/after_think": 166.25, + "token_counts/before_target": 1487.5, + 
"token_counts/before_think": 1556.25 + }, + { + "avg_penalty/after_target": 2.1893779635429382, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.39696843922138214, + "avg_penalty/before_think": 0.5299946367740631, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 750.25, + "completions/max_terminated_length": 606.5, + "completions/mean_length": 264.1875, + "completions/mean_terminated_length": 251.5406265258789, + "completions/min_length": 83.5, + "completions/min_terminated_length": 83.5, + "epoch": 0.2035, + "grad_norm": 17.28472137451172, + "kl": 5.9765625, + "learning_rate": 1.9360595357389735e-05, + "loss": 1.0293, + "num_tokens": 16088407.0, + "reward": 1.45703125, + "reward_std": 0.6616154760122299, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.48866813629865646, + "rewards/tag_count_reward/mean": 0.86328125, + "rewards/tag_count_reward/std": 0.22483261674642563, + "step": 407, + "token_counts/after_target": 553.75, + "token_counts/after_think": 212.75, + "token_counts/before_target": 1935.75, + "token_counts/before_think": 1524.75 + }, + { + "avg_penalty/after_target": 1.944115698337555, + "avg_penalty/after_think": 3.794728994369507, + "avg_penalty/before_target": 0.3804113119840622, + "avg_penalty/before_think": 0.48259287327528, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.5, + "completions/max_terminated_length": 564.5, + "completions/mean_length": 250.546875, + "completions/mean_terminated_length": 250.546875, + "completions/min_length": 102.25, + "completions/min_terminated_length": 
102.25, + "epoch": 0.204, + "grad_norm": 9.629474639892578, + "kl": 10.203125, + "learning_rate": 1.9354440308298676e-05, + "loss": 1.1929, + "num_tokens": 16114762.0, + "reward": 1.390625, + "reward_std": 0.7387124300003052, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.5049516260623932, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.28767210245132446, + "step": 408, + "token_counts/after_target": 524.75, + "token_counts/after_think": 171.5, + "token_counts/before_target": 2118.25, + "token_counts/before_think": 1194.25 + }, + { + "avg_penalty/after_target": 2.897066295146942, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4417809173464775, + "avg_penalty/before_think": 0.422883115708828, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.75, + "completions/max_terminated_length": 672.75, + "completions/mean_length": 258.296875, + "completions/mean_terminated_length": 258.296875, + "completions/min_length": 90.25, + "completions/min_terminated_length": 90.25, + "epoch": 0.2045, + "grad_norm": 5.960118770599365, + "kl": 16.0625, + "learning_rate": 1.9348256763960146e-05, + "loss": 1.5786, + "num_tokens": 16142813.0, + "reward": 1.28515625, + "reward_std": 0.7556709498167038, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.515625, + "rewards/format_reward/std": 0.49654312431812286, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.3178487718105316, + "step": 409, + "token_counts/after_target": 769.0, + "token_counts/after_think": 104.5, + "token_counts/before_target": 1878.5, + "token_counts/before_think": 1380.75 + }, + { + 
"avg_penalty/after_target": 2.205017864704132, + "avg_penalty/after_think": 2.982102394104004, + "avg_penalty/before_target": 0.39029061794281006, + "avg_penalty/before_think": 0.5300530716776848, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 269.4375, + "completions/mean_terminated_length": 269.4375, + "completions/min_length": 75.25, + "completions/min_terminated_length": 75.25, + "epoch": 0.205, + "grad_norm": 16.465198516845703, + "kl": 26.21875, + "learning_rate": 1.9342044743210295e-05, + "loss": 1.9026, + "num_tokens": 16171993.0, + "reward": 1.07421875, + "reward_std": 0.7862026691436768, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.390625, + "rewards/format_reward/std": 0.498777836561203, + "rewards/tag_count_reward/mean": 0.68359375, + "rewards/tag_count_reward/std": 0.36345522105693817, + "step": 410, + "token_counts/after_target": 892.25, + "token_counts/after_think": 104.5, + "token_counts/before_target": 2316.0, + "token_counts/before_think": 998.25 + }, + { + "avg_penalty/after_target": 2.6240633726119995, + "avg_penalty/after_think": 2.855885684490204, + "avg_penalty/before_target": 0.3701094053685665, + "avg_penalty/before_think": 0.5148376002907753, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 699.5, + "completions/max_terminated_length": 638.75, + "completions/mean_length": 256.625, + "completions/mean_terminated_length": 245.7125015258789, + "completions/min_length": 73.25, + "completions/min_terminated_length": 73.25, + "epoch": 0.2055, + "grad_norm": 
18.950368881225586, + "kl": 30.5, + "learning_rate": 1.9335804264972018e-05, + "loss": 2.1487, + "num_tokens": 16198641.0, + "reward": 1.15625, + "reward_std": 0.8002200722694397, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.4375, + "rewards/format_reward/std": 0.4970766380429268, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.38619082421064377, + "step": 411, + "token_counts/after_target": 752.25, + "token_counts/after_think": 67.25, + "token_counts/before_target": 2104.0, + "token_counts/before_think": 1182.5 + }, + { + "avg_penalty/after_target": 2.60427126288414, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3124295063316822, + "avg_penalty/before_think": 0.4764684736728668, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.5, + "completions/max_terminated_length": 595.5, + "completions/mean_length": 245.21875, + "completions/mean_terminated_length": 245.21875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.206, + "grad_norm": 17.69639015197754, + "kl": 28.03125, + "learning_rate": 1.9329535348254893e-05, + "loss": 2.0329, + "num_tokens": 16222351.0, + "reward": 1.26171875, + "reward_std": 0.8326871693134308, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.53125, + "rewards/format_reward/std": 0.49776528775691986, + "rewards/tag_count_reward/mean": 0.73046875, + "rewards/tag_count_reward/std": 0.39316124469041824, + "step": 412, + "token_counts/after_target": 654.75, + "token_counts/after_think": 89.75, + "token_counts/before_target": 2028.25, + "token_counts/before_think": 1150.75 + }, + { + "avg_penalty/after_target": 2.2255199551582336, + "avg_penalty/after_think": 
2.7917752861976624, + "avg_penalty/before_target": 0.5998575389385223, + "avg_penalty/before_think": 0.5327797085046768, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 813.75, + "completions/max_terminated_length": 755.75, + "completions/mean_length": 276.3125, + "completions/mean_terminated_length": 264.828125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.2065, + "grad_norm": 5.850002765655518, + "kl": 25.203125, + "learning_rate": 1.9323238012155125e-05, + "loss": 2.1738, + "num_tokens": 16250131.0, + "reward": 1.3046875, + "reward_std": 0.7740167677402496, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.498777836561203, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.36535871773958206, + "step": 413, + "token_counts/after_target": 1250.25, + "token_counts/after_think": 56.5, + "token_counts/before_target": 2129.0, + "token_counts/before_think": 985.25 + }, + { + "avg_penalty/after_target": 3.3156668543815613, + "avg_penalty/after_think": 3.898883283138275, + "avg_penalty/before_target": 0.2936146482825279, + "avg_penalty/before_think": 0.4489807039499283, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.25, + "completions/max_terminated_length": 480.25, + "completions/mean_length": 227.046875, + "completions/mean_terminated_length": 227.046875, + "completions/min_length": 103.5, + "completions/min_terminated_length": 103.5, + "epoch": 0.207, + "grad_norm": 4.870482921600342, + "kl": 12.65625, + "learning_rate": 1.931691227585549e-05, + 
"loss": 1.2298, + "num_tokens": 16274534.0, + "reward": 1.48828125, + "reward_std": 0.7419796586036682, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.48025963455438614, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.32931485027074814, + "step": 414, + "token_counts/after_target": 554.5, + "token_counts/after_think": 81.25, + "token_counts/before_target": 1695.25, + "token_counts/before_think": 1301.75 + }, + { + "avg_penalty/after_target": 3.0803638696670532, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3186888284981251, + "avg_penalty/before_think": 0.3384712040424347, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.25, + "completions/max_terminated_length": 486.25, + "completions/mean_length": 236.25, + "completions/mean_terminated_length": 236.25, + "completions/min_length": 87.5, + "completions/min_terminated_length": 87.5, + "epoch": 0.2075, + "grad_norm": 5.409963607788086, + "kl": 18.90625, + "learning_rate": 1.9310558158625286e-05, + "loss": 1.4317, + "num_tokens": 16298630.0, + "reward": 1.47265625, + "reward_std": 0.7953952252864838, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.457730233669281, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.37959881126880646, + "step": 415, + "token_counts/after_target": 566.75, + "token_counts/after_think": 112.0, + "token_counts/before_target": 1926.5, + "token_counts/before_think": 1174.75 + }, + { + "avg_penalty/after_target": 2.6092889308929443, + "avg_penalty/after_think": 3.4396694898605347, + "avg_penalty/before_target": 0.39100120961666107, + 
"avg_penalty/before_think": 0.4210973381996155, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.25, + "completions/max_terminated_length": 472.25, + "completions/mean_length": 226.484375, + "completions/mean_terminated_length": 226.484375, + "completions/min_length": 58.25, + "completions/min_terminated_length": 58.25, + "epoch": 0.208, + "grad_norm": 9.142251968383789, + "kl": 23.625, + "learning_rate": 1.9304175679820247e-05, + "loss": 1.746, + "num_tokens": 16326469.0, + "reward": 1.2421875, + "reward_std": 0.8213056474924088, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.5625, + "rewards/format_reward/std": 0.4665650501847267, + "rewards/tag_count_reward/mean": 0.6796875, + "rewards/tag_count_reward/std": 0.3978366553783417, + "step": 416, + "token_counts/after_target": 695.75, + "token_counts/after_think": 67.25, + "token_counts/before_target": 1801.25, + "token_counts/before_think": 1059.5 + }, + { + "avg_penalty/after_target": 2.8562105894088745, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.256984356790781, + "avg_penalty/before_think": 0.4694726914167404, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.75, + "completions/max_terminated_length": 416.75, + "completions/mean_length": 185.765625, + "completions/mean_terminated_length": 185.765625, + "completions/min_length": 66.5, + "completions/min_terminated_length": 66.5, + "epoch": 0.2085, + "grad_norm": 3.1218559741973877, + "kl": 20.25, + "learning_rate": 1.9297764858882516e-05, + "loss": 1.5544, + "num_tokens": 16349286.0, + "reward": 1.38671875, + "reward_std": 
0.9412729889154434, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4876555874943733, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.43676090240478516, + "step": 417, + "token_counts/after_target": 402.5, + "token_counts/after_think": 40.5, + "token_counts/before_target": 1752.25, + "token_counts/before_think": 777.0 + }, + { + "avg_penalty/after_target": 2.9903857111930847, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.259342797100544, + "avg_penalty/before_think": 0.4679728299379349, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.25, + "completions/max_terminated_length": 444.25, + "completions/mean_length": 180.34375, + "completions/mean_terminated_length": 180.34375, + "completions/min_length": 65.25, + "completions/min_terminated_length": 65.25, + "epoch": 0.209, + "grad_norm": 4.143587112426758, + "kl": 19.046875, + "learning_rate": 1.9291325715340562e-05, + "loss": 1.5415, + "num_tokens": 16370060.0, + "reward": 1.359375, + "reward_std": 0.8777407705783844, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.48558124154806137, + "rewards/tag_count_reward/mean": 0.703125, + "rewards/tag_count_reward/std": 0.43658555299043655, + "step": 418, + "token_counts/after_target": 271.75, + "token_counts/after_think": 108.0, + "token_counts/before_target": 1913.0, + "token_counts/before_think": 592.75 + }, + { + "avg_penalty/after_target": 2.5052300691604614, + "avg_penalty/after_think": 3.611207664012909, + "avg_penalty/before_target": 0.3221721351146698, + "avg_penalty/before_think": 0.37606392800807953, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 186.734375, + "completions/mean_terminated_length": 186.734375, + "completions/min_length": 92.25, + "completions/min_terminated_length": 92.25, + "epoch": 0.2095, + "grad_norm": 13.556183815002441, + "kl": 8.546875, + "learning_rate": 1.9284858268809135e-05, + "loss": 1.0639, + "num_tokens": 16392059.0, + "reward": 1.6640625, + "reward_std": 0.6941564157605171, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.3100050464272499, + "step": 419, + "token_counts/after_target": 282.5, + "token_counts/after_think": 94.5, + "token_counts/before_target": 1722.25, + "token_counts/before_think": 888.5 + }, + { + "avg_penalty/after_target": 3.5007471442222595, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.271180447191, + "avg_penalty/before_think": 0.3039815127849579, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.25, + "completions/max_terminated_length": 399.25, + "completions/mean_length": 166.109375, + "completions/mean_terminated_length": 166.109375, + "completions/min_length": 64.5, + "completions/min_terminated_length": 64.5, + "epoch": 0.21, + "grad_norm": 18.406192779541016, + "kl": 12.2421875, + "learning_rate": 1.92783625389892e-05, + "loss": 1.4439, + "num_tokens": 16414690.0, + "reward": 1.43359375, + "reward_std": 0.8410352915525436, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4581565484404564, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.4124985709786415, + "step": 420, + "token_counts/after_target": 434.5, + "token_counts/after_think": 41.5, + "token_counts/before_target": 1557.75, + "token_counts/before_think": 624.0 + }, + { + "avg_penalty/after_target": 1.8515137732028961, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.2686120867729187, + "avg_penalty/before_think": 0.36393189430236816, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.75, + "completions/max_terminated_length": 389.75, + "completions/mean_length": 147.1875, + "completions/mean_terminated_length": 147.1875, + "completions/min_length": 60.75, + "completions/min_terminated_length": 60.75, + "epoch": 0.2105, + "grad_norm": 8.977372169494629, + "kl": 11.86328125, + "learning_rate": 1.9271838545667876e-05, + "loss": 1.2043, + "num_tokens": 16433406.0, + "reward": 1.5, + "reward_std": 0.7906631082296371, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.46034691482782364, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.3840211033821106, + "step": 421, + "token_counts/after_target": 210.75, + "token_counts/after_think": 69.75, + "token_counts/before_target": 1313.75, + "token_counts/before_think": 760.75 + }, + { + "avg_penalty/after_target": 3.032051980495453, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3743552267551422, + "avg_penalty/before_think": 0.24403229355812073, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 
0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 190.1875, + "completions/mean_terminated_length": 190.1875, + "completions/min_length": 71.75, + "completions/min_terminated_length": 71.75, + "epoch": 0.211, + "grad_norm": 18.024085998535156, + "kl": 17.6484375, + "learning_rate": 1.9265286308718374e-05, + "loss": 1.923, + "num_tokens": 16456234.0, + "reward": 1.31640625, + "reward_std": 0.7995649725198746, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.44159944355487823, + "rewards/tag_count_reward/mean": 0.67578125, + "rewards/tag_count_reward/std": 0.39334407448768616, + "step": 422, + "token_counts/after_target": 763.0, + "token_counts/after_think": 27.25, + "token_counts/before_target": 1470.75, + "token_counts/before_think": 782.0 + }, + { + "avg_penalty/after_target": 2.112022191286087, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4553000330924988, + "avg_penalty/before_think": 0.46144747734069824, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.25, + "completions/max_terminated_length": 578.25, + "completions/mean_length": 198.78125, + "completions/mean_terminated_length": 198.78125, + "completions/min_length": 46.75, + "completions/min_terminated_length": 46.75, + "epoch": 0.2115, + "grad_norm": 4.168260097503662, + "kl": 22.84375, + "learning_rate": 1.925870584809995e-05, + "loss": 2.0441, + "num_tokens": 16482668.0, + "reward": 1.35546875, + "reward_std": 0.8422169536352158, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + 
"rewards/format_reward/std": 0.44495995342731476, + "rewards/tag_count_reward/mean": 0.66796875, + "rewards/tag_count_reward/std": 0.4173430800437927, + "step": 423, + "token_counts/after_target": 572.75, + "token_counts/after_think": 46.25, + "token_counts/before_target": 2058.25, + "token_counts/before_think": 503.25 + }, + { + "avg_penalty/after_target": 2.517243891954422, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.2873798720538616, + "avg_penalty/before_think": 0.3235159255564213, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 171.421875, + "completions/mean_terminated_length": 171.421875, + "completions/min_length": 55.25, + "completions/min_terminated_length": 55.25, + "epoch": 0.212, + "grad_norm": 5.0070600509643555, + "kl": 17.59375, + "learning_rate": 1.9252097183857822e-05, + "loss": 1.4291, + "num_tokens": 16500983.0, + "reward": 1.390625, + "reward_std": 0.7838190346956253, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.46566852182149887, + "rewards/tag_count_reward/mean": 0.703125, + "rewards/tag_count_reward/std": 0.3678334951400757, + "step": 424, + "token_counts/after_target": 292.5, + "token_counts/after_think": 37.0, + "token_counts/before_target": 1687.5, + "token_counts/before_think": 725.75 + }, + { + "avg_penalty/after_target": 2.579320251941681, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.5488417334854603, + "avg_penalty/before_think": 0.4661477282643318, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + 
"completions/max_length": 870.75, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 329.421875, + "completions/mean_terminated_length": 299.3787307739258, + "completions/min_length": 16.75, + "completions/min_terminated_length": 16.75, + "epoch": 0.2125, + "grad_norm": 45.068294525146484, + "kl": 47.8125, + "learning_rate": 1.9245460336123136e-05, + "loss": 2.9591, + "num_tokens": 16534034.0, + "reward": 0.34375, + "reward_std": 0.46742770820856094, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.25, + "rewards/tag_count_reward/mean": 0.28125, + "rewards/tag_count_reward/std": 0.2928798720240593, + "step": 425, + "token_counts/after_target": 1674.5, + "token_counts/after_think": 0.0, + "token_counts/before_target": 3140.75, + "token_counts/before_think": 455.5 + }, + { + "avg_penalty/after_target": 2.5480183959007263, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.4412916526198387, + "avg_penalty/before_think": 0.3654787950217724, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.75, + "completions/max_terminated_length": 703.75, + "completions/mean_length": 291.859375, + "completions/mean_terminated_length": 291.859375, + "completions/min_length": 13.25, + "completions/min_terminated_length": 13.25, + "epoch": 0.213, + "grad_norm": 46.29287338256836, + "kl": 48.375, + "learning_rate": 1.9238795325112867e-05, + "loss": 2.8493, + "num_tokens": 16561401.0, + "reward": 0.2265625, + "reward_std": 0.2436629682779312, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.2265625, + "rewards/tag_count_reward/std": 
0.2436629720032215, + "step": 426, + "token_counts/after_target": 1267.5, + "token_counts/after_think": 0.0, + "token_counts/before_target": 3118.25, + "token_counts/before_think": 284.0 + }, + { + "avg_penalty/after_target": 2.480224072933197, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.5625372491776943, + "avg_penalty/before_think": 2.0273759216070175, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 732.75, + "completions/max_terminated_length": 701.25, + "completions/mean_length": 299.96875, + "completions/mean_terminated_length": 272.6183052062988, + "completions/min_length": 15.25, + "completions/min_terminated_length": 15.25, + "epoch": 0.2135, + "grad_norm": 31.489761352539062, + "kl": 41.0625, + "learning_rate": 1.923210217112981e-05, + "loss": 2.7245, + "num_tokens": 16593207.0, + "reward": 0.35546875, + "reward_std": 0.4291645213961601, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.14789126068353653, + "rewards/tag_count_reward/mean": 0.29296875, + "rewards/tag_count_reward/std": 0.3182108849287033, + "step": 427, + "token_counts/after_target": 1420.75, + "token_counts/after_think": 145.75, + "token_counts/before_target": 2822.0, + "token_counts/before_think": 411.0 + }, + { + "avg_penalty/after_target": 2.9815708994865417, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.3364066928625107, + "avg_penalty/before_think": 0.5396645665168762, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.25, + "completions/max_terminated_length": 679.25, + "completions/mean_length": 
291.5625, + "completions/mean_terminated_length": 291.5625, + "completions/min_length": 38.75, + "completions/min_terminated_length": 38.75, + "epoch": 0.214, + "grad_norm": 9.61830997467041, + "kl": 24.75, + "learning_rate": 1.9225380894562466e-05, + "loss": 1.9644, + "num_tokens": 16625147.0, + "reward": 0.55859375, + "reward_std": 0.661612331867218, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.171875, + "rewards/format_reward/std": 0.3683478757739067, + "rewards/tag_count_reward/mean": 0.38671875, + "rewards/tag_count_reward/std": 0.3634301424026489, + "step": 428, + "token_counts/after_target": 1219.0, + "token_counts/after_think": 16.25, + "token_counts/before_target": 2653.75, + "token_counts/before_think": 776.0 + }, + { + "avg_penalty/after_target": 2.4363613724708557, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4216156080365181, + "avg_penalty/before_think": 0.750138521194458, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.25, + "completions/max_terminated_length": 578.25, + "completions/mean_length": 240.640625, + "completions/mean_terminated_length": 240.640625, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.2145, + "grad_norm": 7.6084747314453125, + "kl": 16.25, + "learning_rate": 1.9218631515885007e-05, + "loss": 1.5112, + "num_tokens": 16650244.0, + "reward": 0.9140625, + "reward_std": 0.8404648154973984, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.48866813629865646, + "rewards/tag_count_reward/mean": 0.5390625, + "rewards/tag_count_reward/std": 0.4095943048596382, + "step": 429, + "token_counts/after_target": 813.75, + 
"token_counts/after_think": 28.0, + "token_counts/before_target": 2519.0, + "token_counts/before_think": 489.5 + }, + { + "avg_penalty/after_target": 2.4721890091896057, + "avg_penalty/after_think": 3.439653366804123, + "avg_penalty/before_target": 0.4137706086039543, + "avg_penalty/before_think": 0.4576721787452698, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.5, + "completions/max_terminated_length": 609.5, + "completions/mean_length": 241.125, + "completions/mean_terminated_length": 241.125, + "completions/min_length": 66.5, + "completions/min_terminated_length": 66.5, + "epoch": 0.215, + "grad_norm": 15.163606643676758, + "kl": 10.015625, + "learning_rate": 1.9211854055657216e-05, + "loss": 1.3882, + "num_tokens": 16675228.0, + "reward": 1.171875, + "reward_std": 0.8621248602867126, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.484375, + "rewards/format_reward/std": 0.498777836561203, + "rewards/tag_count_reward/mean": 0.671875, + "rewards/tag_count_reward/std": 0.40500323474407196, + "step": 430, + "token_counts/after_target": 855.5, + "token_counts/after_think": 40.0, + "token_counts/before_target": 2260.75, + "token_counts/before_think": 701.75 + }, + { + "avg_penalty/after_target": 2.6812936067581177, + "avg_penalty/after_think": 0.48410773277282715, + "avg_penalty/before_target": 0.2320276014506817, + "avg_penalty/before_think": 0.45259228721261024, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 197.578125, + "completions/mean_terminated_length": 197.578125, + 
"completions/min_length": 28.75, + "completions/min_terminated_length": 28.75, + "epoch": 0.2155, + "grad_norm": 6.050086975097656, + "kl": 11.96875, + "learning_rate": 1.9205048534524405e-05, + "loss": 1.1801, + "num_tokens": 16696305.0, + "reward": 1.01171875, + "reward_std": 0.7497120201587677, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.359375, + "rewards/format_reward/std": 0.48558124154806137, + "rewards/tag_count_reward/mean": 0.65234375, + "rewards/tag_count_reward/std": 0.34656211733818054, + "step": 431, + "token_counts/after_target": 386.5, + "token_counts/after_think": 9.75, + "token_counts/before_target": 2083.75, + "token_counts/before_think": 681.25 + }, + { + "avg_penalty/after_target": 2.9925793409347534, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.30501386523246765, + "avg_penalty/before_think": 0.4076165333390236, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.5, + "completions/max_terminated_length": 586.5, + "completions/mean_length": 235.5, + "completions/mean_terminated_length": 235.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.216, + "grad_norm": 7.468707084655762, + "kl": 16.4375, + "learning_rate": 1.919821497321738e-05, + "loss": 1.6372, + "num_tokens": 16719921.0, + "reward": 0.7421875, + "reward_std": 0.6717770397663116, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.1875, + "rewards/format_reward/std": 0.37937305867671967, + "rewards/tag_count_reward/mean": 0.5546875, + "rewards/tag_count_reward/std": 0.38613446056842804, + "step": 432, + "token_counts/after_target": 848.75, + "token_counts/after_think": 0.0, + "token_counts/before_target": 2246.75, + 
"token_counts/before_think": 672.5 + }, + { + "avg_penalty/after_target": 2.01770681142807, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3615508899092674, + "avg_penalty/before_think": 0.708089143037796, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.5, + "completions/max_terminated_length": 702.5, + "completions/mean_length": 232.9375, + "completions/mean_terminated_length": 232.9375, + "completions/min_length": 16.25, + "completions/min_terminated_length": 16.25, + "epoch": 0.2165, + "grad_norm": 8.727030754089355, + "kl": 22.0, + "learning_rate": 1.9191353392552346e-05, + "loss": 1.6872, + "num_tokens": 16746013.0, + "reward": 0.81640625, + "reward_std": 0.7899299412965775, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.45247192680835724, + "rewards/tag_count_reward/mean": 0.53515625, + "rewards/tag_count_reward/std": 0.4044773057103157, + "step": 433, + "token_counts/after_target": 639.75, + "token_counts/after_think": 19.0, + "token_counts/before_target": 2574.75, + "token_counts/before_think": 493.5 + }, + { + "avg_penalty/after_target": 2.7248952984809875, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.2846469208598137, + "avg_penalty/before_think": 0.40338122472167015, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 241.25, + "completions/mean_terminated_length": 241.25, + "completions/min_length": 39.5, + "completions/min_terminated_length": 39.5, + "epoch": 0.217, + "grad_norm": 
11.342061042785645, + "kl": 23.1875, + "learning_rate": 1.9184463813430874e-05, + "loss": 1.6687, + "num_tokens": 16769661.0, + "reward": 0.9453125, + "reward_std": 0.8486528098583221, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.359375, + "rewards/format_reward/std": 0.49467839300632477, + "rewards/tag_count_reward/mean": 0.5703125, + "rewards/tag_count_reward/std": 0.41340167075395584, + "step": 434, + "token_counts/after_target": 621.0, + "token_counts/after_think": 0.0, + "token_counts/before_target": 2797.5, + "token_counts/before_think": 441.5 + }, + { + "avg_penalty/after_target": 2.3429446816444397, + "avg_penalty/after_think": 1.8387734293937683, + "avg_penalty/before_target": 0.30798186361789703, + "avg_penalty/before_think": 0.4719417802989483, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.75, + "completions/max_terminated_length": 559.75, + "completions/mean_length": 223.625, + "completions/mean_terminated_length": 223.625, + "completions/min_length": 53.75, + "completions/min_terminated_length": 53.75, + "epoch": 0.2175, + "grad_norm": 3.5132980346679688, + "kl": 15.40625, + "learning_rate": 1.9177546256839814e-05, + "loss": 1.4157, + "num_tokens": 16792309.0, + "reward": 1.1328125, + "reward_std": 0.8117695450782776, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.4375, + "rewards/format_reward/std": 0.5102732330560684, + "rewards/tag_count_reward/mean": 0.6953125, + "rewards/tag_count_reward/std": 0.35007575154304504, + "step": 435, + "token_counts/after_target": 537.0, + "token_counts/after_think": 63.5, + "token_counts/before_target": 2042.5, + "token_counts/before_think": 935.0 + }, + { + "avg_penalty/after_target": 2.2756866216659546, + 
"avg_penalty/after_think": 2.828365683555603, + "avg_penalty/before_target": 0.36475761234760284, + "avg_penalty/before_think": 0.4329071193933487, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 196.359375, + "completions/mean_terminated_length": 196.359375, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.218, + "grad_norm": 14.012145042419434, + "kl": 11.203125, + "learning_rate": 1.917060074385124e-05, + "loss": 1.3022, + "num_tokens": 16814956.0, + "reward": 1.4296875, + "reward_std": 0.8001967370510101, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.49467839300632477, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.34311313554644585, + "step": 436, + "token_counts/after_target": 594.75, + "token_counts/after_think": 67.75, + "token_counts/before_target": 1623.0, + "token_counts/before_think": 856.25 + }, + { + "avg_penalty/after_target": 2.4168174862861633, + "avg_penalty/after_think": 3.4211471676826477, + "avg_penalty/before_target": 0.355468075722456, + "avg_penalty/before_think": 0.4160300940275192, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 228.125, + "completions/mean_terminated_length": 228.125, + "completions/min_length": 59.25, + "completions/min_terminated_length": 59.25, + "epoch": 0.2185, + "grad_norm": 6.0580902099609375, + "kl": 18.484375, + "learning_rate": 
1.9163627295622397e-05, + "loss": 1.6937, + "num_tokens": 16838228.0, + "reward": 1.296875, + "reward_std": 0.8597573935985565, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.49606408923864365, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.3937726318836212, + "step": 437, + "token_counts/after_target": 529.5, + "token_counts/after_think": 58.25, + "token_counts/before_target": 2236.5, + "token_counts/before_think": 825.75 + }, + { + "avg_penalty/after_target": 3.0391968190670013, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.25833388045430183, + "avg_penalty/before_think": 0.548102892935276, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.25, + "completions/max_terminated_length": 596.25, + "completions/mean_length": 215.828125, + "completions/mean_terminated_length": 215.828125, + "completions/min_length": 69.75, + "completions/min_terminated_length": 69.75, + "epoch": 0.219, + "grad_norm": 2.708965539932251, + "kl": 19.96875, + "learning_rate": 1.9156625933395614e-05, + "loss": 1.6454, + "num_tokens": 16863609.0, + "reward": 1.30078125, + "reward_std": 0.8160281628370285, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.5625, + "rewards/format_reward/std": 0.4970766380429268, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.3663733899593353, + "step": 438, + "token_counts/after_target": 486.5, + "token_counts/after_think": 152.75, + "token_counts/before_target": 2211.75, + "token_counts/before_think": 602.25 + }, + { + "avg_penalty/after_target": 2.1462994515895844, + "avg_penalty/after_think": 2.581453800201416, + 
"avg_penalty/before_target": 0.2969907857477665, + "avg_penalty/before_think": 0.4160325825214386, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.5, + "completions/max_terminated_length": 526.5, + "completions/mean_length": 188.25, + "completions/mean_terminated_length": 188.25, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.2195, + "grad_norm": 10.715665817260742, + "kl": 23.96875, + "learning_rate": 1.914959667849825e-05, + "loss": 1.7075, + "num_tokens": 16886009.0, + "reward": 1.21484375, + "reward_std": 0.8934095203876495, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.48989029973745346, + "rewards/tag_count_reward/mean": 0.65234375, + "rewards/tag_count_reward/std": 0.41929131746292114, + "step": 439, + "token_counts/after_target": 360.5, + "token_counts/after_think": 54.75, + "token_counts/before_target": 2039.5, + "token_counts/before_think": 557.25 + }, + { + "avg_penalty/after_target": 2.2987721264362335, + "avg_penalty/after_think": 2.887602150440216, + "avg_penalty/before_target": 0.33730338513851166, + "avg_penalty/before_think": 0.43974071741104126, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.75, + "completions/max_terminated_length": 536.75, + "completions/mean_length": 188.859375, + "completions/mean_terminated_length": 188.859375, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.22, + "grad_norm": 3.201742172241211, + "kl": 19.09375, + "learning_rate": 1.9142539552342638e-05, + "loss": 1.6608, + 
"num_tokens": 16908368.0, + "reward": 1.44921875, + "reward_std": 0.7709725052118301, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4284028485417366, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.34764593094587326, + "step": 440, + "token_counts/after_target": 493.25, + "token_counts/after_think": 58.25, + "token_counts/before_target": 1690.75, + "token_counts/before_think": 779.5 + }, + { + "avg_penalty/after_target": 2.2175504565238953, + "avg_penalty/after_think": 2.978985071182251, + "avg_penalty/before_target": 0.4665267914533615, + "avg_penalty/before_think": 0.6304388120770454, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 668.5, + "completions/max_terminated_length": 666.75, + "completions/mean_length": 237.640625, + "completions/mean_terminated_length": 226.6135482788086, + "completions/min_length": 60.5, + "completions/min_terminated_length": 60.5, + "epoch": 0.2205, + "grad_norm": 4.650007724761963, + "kl": 21.984375, + "learning_rate": 1.913545457642601e-05, + "loss": 2.0071, + "num_tokens": 16936393.0, + "reward": 1.30859375, + "reward_std": 0.8441546857357025, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.4745560586452484, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.41291818022727966, + "step": 441, + "token_counts/after_target": 1041.75, + "token_counts/after_think": 94.75, + "token_counts/before_target": 1974.0, + "token_counts/before_think": 691.75 + }, + { + "avg_penalty/after_target": 2.1800880134105682, + "avg_penalty/after_think": 3.8188217878341675, + "avg_penalty/before_target": 
0.4009094275534153, + "avg_penalty/before_think": 0.41406024992465973, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.25, + "completions/max_terminated_length": 452.25, + "completions/mean_length": 202.796875, + "completions/mean_terminated_length": 202.796875, + "completions/min_length": 70.75, + "completions/min_terminated_length": 70.75, + "epoch": 0.221, + "grad_norm": 4.3921332359313965, + "kl": 20.546875, + "learning_rate": 1.9128341772330428e-05, + "loss": 1.6785, + "num_tokens": 16960044.0, + "reward": 1.41796875, + "reward_std": 0.967670127749443, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.125, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.48456869274377823, + "rewards/tag_count_reward/mean": 0.69921875, + "rewards/tag_count_reward/std": 0.4329608753323555, + "step": 442, + "token_counts/after_target": 603.75, + "token_counts/after_think": 53.5, + "token_counts/before_target": 1957.5, + "token_counts/before_think": 630.0 + }, + { + "avg_penalty/after_target": 3.036925584077835, + "avg_penalty/after_think": 1.6881200671195984, + "avg_penalty/before_target": 0.23790273815393448, + "avg_penalty/before_think": 0.4668658673763275, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 184.4375, + "completions/mean_terminated_length": 184.4375, + "completions/min_length": 46.75, + "completions/min_terminated_length": 46.75, + "epoch": 0.2215, + "grad_norm": 8.046587944030762, + "kl": 23.6875, + "learning_rate": 1.9121201161722732e-05, + "loss": 1.7412, + "num_tokens": 16979288.0, + "reward": 
1.02734375, + "reward_std": 0.8791394680738449, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.4375, + "rewards/format_reward/std": 0.48025963455438614, + "rewards/tag_count_reward/mean": 0.58984375, + "rewards/tag_count_reward/std": 0.4458109959959984, + "step": 443, + "token_counts/after_target": 408.75, + "token_counts/after_think": 34.25, + "token_counts/before_target": 1954.5, + "token_counts/before_think": 553.5 + }, + { + "avg_penalty/after_target": 2.217905879020691, + "avg_penalty/after_think": 3.2500825226306915, + "avg_penalty/before_target": 0.2843176871538162, + "avg_penalty/before_think": 0.4061814025044441, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.75, + "completions/max_terminated_length": 443.75, + "completions/mean_length": 192.921875, + "completions/mean_terminated_length": 192.921875, + "completions/min_length": 73.75, + "completions/min_terminated_length": 73.75, + "epoch": 0.222, + "grad_norm": 3.802492141723633, + "kl": 17.40625, + "learning_rate": 1.9114032766354453e-05, + "loss": 1.36, + "num_tokens": 17002099.0, + "reward": 1.23046875, + "reward_std": 0.9588146656751633, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.5040994435548782, + "rewards/tag_count_reward/mean": 0.63671875, + "rewards/tag_count_reward/std": 0.47025061398744583, + "step": 444, + "token_counts/after_target": 260.75, + "token_counts/after_think": 77.75, + "token_counts/before_target": 2054.75, + "token_counts/before_think": 693.5 + }, + { + "avg_penalty/after_target": 1.9472506642341614, + "avg_penalty/after_think": 1.9102085828781128, + "avg_penalty/before_target": 0.364621564745903, + "avg_penalty/before_think": 
0.6479305401444435, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.75, + "completions/max_terminated_length": 602.75, + "completions/mean_length": 206.453125, + "completions/mean_terminated_length": 206.453125, + "completions/min_length": 57.5, + "completions/min_terminated_length": 57.5, + "epoch": 0.2225, + "grad_norm": 8.114907264709473, + "kl": 14.84375, + "learning_rate": 1.910683660806177e-05, + "loss": 1.4803, + "num_tokens": 17023696.0, + "reward": 1.2890625, + "reward_std": 0.8646083474159241, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4696519449353218, + "rewards/tag_count_reward/mean": 0.6796875, + "rewards/tag_count_reward/std": 0.42616260051727295, + "step": 445, + "token_counts/after_target": 573.25, + "token_counts/after_think": 106.25, + "token_counts/before_target": 1966.0, + "token_counts/before_think": 657.75 + }, + { + "avg_penalty/after_target": 1.7292157411575317, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4040599390864372, + "avg_penalty/before_think": 0.5055936500430107, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 222.921875, + "completions/mean_terminated_length": 222.921875, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.223, + "grad_norm": 5.0814971923828125, + "kl": 19.0, + "learning_rate": 1.9099612708765432e-05, + "loss": 1.7217, + "num_tokens": 17049131.0, + "reward": 1.109375, + "reward_std": 0.9436095505952835, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.515625, + "rewards/format_reward/std": 0.5028772801160812, + "rewards/tag_count_reward/mean": 0.59375, + "rewards/tag_count_reward/std": 0.4645829051733017, + "step": 446, + "token_counts/after_target": 646.0, + "token_counts/after_think": 58.25, + "token_counts/before_target": 2275.0, + "token_counts/before_think": 587.5 + }, + { + "avg_penalty/after_target": 2.77433580160141, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.4509436897933483, + "avg_penalty/before_think": 0.419439435005188, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.25, + "completions/max_terminated_length": 692.25, + "completions/mean_length": 214.03125, + "completions/mean_terminated_length": 214.03125, + "completions/min_length": 44.25, + "completions/min_terminated_length": 44.25, + "epoch": 0.2235, + "grad_norm": 14.86379337310791, + "kl": 16.828125, + "learning_rate": 1.9092361090470688e-05, + "loss": 1.6995, + "num_tokens": 17074509.0, + "reward": 1.12890625, + "reward_std": 0.9135421216487885, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.53125, + "rewards/format_reward/std": 0.49500229209661484, + "rewards/tag_count_reward/mean": 0.59765625, + "rewards/tag_count_reward/std": 0.4410993233323097, + "step": 447, + "token_counts/after_target": 909.25, + "token_counts/after_think": 44.25, + "token_counts/before_target": 1796.5, + "token_counts/before_think": 674.5 + }, + { + "avg_penalty/after_target": 2.233737051486969, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.36118335649371147, + "avg_penalty/before_think": 0.41014672070741653, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.5, + "completions/max_terminated_length": 478.5, + "completions/mean_length": 200.09375, + "completions/mean_terminated_length": 200.09375, + "completions/min_length": 67.75, + "completions/min_terminated_length": 67.75, + "epoch": 0.224, + "grad_norm": 3.191518783569336, + "kl": 19.421875, + "learning_rate": 1.908508177526722e-05, + "loss": 1.592, + "num_tokens": 17097347.0, + "reward": 1.2109375, + "reward_std": 0.8871455788612366, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.5625, + "rewards/format_reward/std": 0.49297719448804855, + "rewards/tag_count_reward/mean": 0.6484375, + "rewards/tag_count_reward/std": 0.43227528035640717, + "step": 448, + "token_counts/after_target": 475.0, + "token_counts/after_think": 49.25, + "token_counts/before_target": 2026.75, + "token_counts/before_think": 650.5 + }, + { + "avg_penalty/after_target": 2.236815869808197, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.32405806705355644, + "avg_penalty/before_think": 0.46574386954307556, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.5, + "completions/max_terminated_length": 426.5, + "completions/mean_length": 165.953125, + "completions/mean_terminated_length": 165.953125, + "completions/min_length": 41.25, + "completions/min_terminated_length": 41.25, + "epoch": 0.2245, + "grad_norm": 6.238887310028076, + "kl": 15.25, + "learning_rate": 1.907777478532909e-05, + "loss": 1.3277, + "num_tokens": 17118080.0, + "reward": 1.50390625, + "reward_std": 0.8944647014141083, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + 
"rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.46513500809669495, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.38896510750055313, + "step": 449, + "token_counts/after_target": 311.5, + "token_counts/after_think": 32.25, + "token_counts/before_target": 1607.0, + "token_counts/before_think": 704.5 + }, + { + "avg_penalty/after_target": 3.09943488240242, + "avg_penalty/after_think": 2.939930558204651, + "avg_penalty/before_target": 0.30410410091280937, + "avg_penalty/before_think": 0.3136487454175949, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.25, + "completions/max_terminated_length": 565.25, + "completions/mean_length": 221.0, + "completions/mean_terminated_length": 221.0, + "completions/min_length": 67.5, + "completions/min_terminated_length": 67.5, + "epoch": 0.225, + "grad_norm": 3.3920738697052, + "kl": 23.34375, + "learning_rate": 1.907044014291465e-05, + "loss": 1.9813, + "num_tokens": 17141488.0, + "reward": 1.28125, + "reward_std": 0.9115633368492126, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.49345622956752777, + "rewards/tag_count_reward/mean": 0.65625, + "rewards/tag_count_reward/std": 0.4424765333533287, + "step": 450, + "token_counts/after_target": 594.25, + "token_counts/after_think": 49.0, + "token_counts/before_target": 1989.75, + "token_counts/before_think": 903.0 + }, + { + "avg_penalty/after_target": 2.511749118566513, + "avg_penalty/after_think": 3.9450690746307373, + "avg_penalty/before_target": 0.5342179462313652, + "avg_penalty/before_think": 0.42625535279512405, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 875.25, + "completions/max_terminated_length": 786.0, + "completions/mean_length": 225.640625, + "completions/mean_terminated_length": 198.7477684020996, + "completions/min_length": 56.25, + "completions/min_terminated_length": 56.25, + "epoch": 0.2255, + "grad_norm": 7.945819854736328, + "kl": 25.71875, + "learning_rate": 1.9063077870366504e-05, + "loss": 2.4416, + "num_tokens": 17171145.0, + "reward": 1.3203125, + "reward_std": 0.7926699072122574, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.4598134011030197, + "rewards/tag_count_reward/mean": 0.6953125, + "rewards/tag_count_reward/std": 0.36852042004466057, + "step": 451, + "token_counts/after_target": 941.5, + "token_counts/after_think": 57.75, + "token_counts/before_target": 1959.25, + "token_counts/before_think": 651.75 + }, + { + "avg_penalty/after_target": 2.634349077939987, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3768739104270935, + "avg_penalty/before_think": 0.3184387721121311, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 537.5, + "completions/max_terminated_length": 451.25, + "completions/mean_length": 209.28125, + "completions/mean_terminated_length": 170.54808044433594, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.226, + "grad_norm": 4.3181304931640625, + "kl": 24.90625, + "learning_rate": 1.9055687990111397e-05, + "loss": 2.1552, + "num_tokens": 17193963.0, + "reward": 1.3671875, + "reward_std": 0.876371905207634, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + 
"rewards/format_reward/std": 0.45916909724473953, + "rewards/tag_count_reward/mean": 0.6953125, + "rewards/tag_count_reward/std": 0.4315378963947296, + "step": 452, + "token_counts/after_target": 934.25, + "token_counts/after_think": 24.0, + "token_counts/before_target": 1810.25, + "token_counts/before_think": 580.0 + }, + { + "avg_penalty/after_target": 2.839801609516144, + "avg_penalty/after_think": 2.7640300393104553, + "avg_penalty/before_target": 0.36546143144369125, + "avg_penalty/before_think": 0.4460003077983856, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 717.75, + "completions/max_terminated_length": 599.75, + "completions/mean_length": 212.890625, + "completions/mean_terminated_length": 200.9947967529297, + "completions/min_length": 75.75, + "completions/min_terminated_length": 75.75, + "epoch": 0.2265, + "grad_norm": 5.8422136306762695, + "kl": 26.140625, + "learning_rate": 1.9048270524660197e-05, + "loss": 2.1857, + "num_tokens": 17223540.0, + "reward": 1.421875, + "reward_std": 0.7694729119539261, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4380975142121315, + "rewards/tag_count_reward/mean": 0.6875, + "rewards/tag_count_reward/std": 0.41032544523477554, + "step": 453, + "token_counts/after_target": 763.25, + "token_counts/after_think": 25.0, + "token_counts/before_target": 1990.75, + "token_counts/before_think": 627.25 + }, + { + "avg_penalty/after_target": 1.9188913702964783, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.36003679037094116, + "avg_penalty/before_think": 0.42969171702861786, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 413.5, + "completions/max_terminated_length": 413.5, + "completions/mean_length": 155.65625, + "completions/mean_terminated_length": 155.65625, + "completions/min_length": 74.25, + "completions/min_terminated_length": 74.25, + "epoch": 0.227, + "grad_norm": 5.3296709060668945, + "kl": 18.9375, + "learning_rate": 1.9040825496607788e-05, + "loss": 1.5928, + "num_tokens": 17241358.0, + "reward": 1.6328125, + "reward_std": 0.7563875466585159, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.125, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.42078252136707306, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.32126183807849884, + "step": 454, + "token_counts/after_target": 232.5, + "token_counts/after_think": 49.0, + "token_counts/before_target": 1554.25, + "token_counts/before_think": 654.75 + }, + { + "avg_penalty/after_target": 2.787009119987488, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3209053538739681, + "avg_penalty/before_think": 0.45733772963285446, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 173.4375, + "completions/mean_terminated_length": 173.4375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.2275, + "grad_norm": 3.896562099456787, + "kl": 17.0, + "learning_rate": 1.903335292863301e-05, + "loss": 1.4906, + "num_tokens": 17262746.0, + "reward": 1.64453125, + "reward_std": 0.7182668596506119, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.39476002007722855, + "rewards/tag_count_reward/mean": 
0.84765625, + "rewards/tag_count_reward/std": 0.3423241227865219, + "step": 455, + "token_counts/after_target": 359.0, + "token_counts/after_think": 50.75, + "token_counts/before_target": 1812.5, + "token_counts/before_think": 552.75 + }, + { + "avg_penalty/after_target": 2.853868693113327, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4154205471277237, + "avg_penalty/before_think": 0.457759752869606, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 816.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 255.703125, + "completions/mean_terminated_length": 230.42083740234375, + "completions/min_length": 94.5, + "completions/min_terminated_length": 94.5, + "epoch": 0.228, + "grad_norm": 4.0670485496521, + "kl": 24.28125, + "learning_rate": 1.902585284349861e-05, + "loss": 2.0151, + "num_tokens": 17287559.0, + "reward": 1.4921875, + "reward_std": 0.8229966163635254, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4625816270709038, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.3910074234008789, + "step": 456, + "token_counts/after_target": 766.75, + "token_counts/after_think": 42.0, + "token_counts/before_target": 2613.5, + "token_counts/before_think": 669.0 + }, + { + "avg_penalty/after_target": 2.80430269241333, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3595820665359497, + "avg_penalty/before_think": 0.5430384129285812, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.25, + "completions/max_terminated_length": 433.25, + 
"completions/mean_length": 223.375, + "completions/mean_terminated_length": 223.375, + "completions/min_length": 86.25, + "completions/min_terminated_length": 86.25, + "epoch": 0.2285, + "grad_norm": 6.393120288848877, + "kl": 6.6796875, + "learning_rate": 1.901832526405114e-05, + "loss": 0.9416, + "num_tokens": 17310207.0, + "reward": 1.8046875, + "reward_std": 0.5188017338514328, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.33406074345111847, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.1932985633611679, + "step": 457, + "token_counts/after_target": 555.0, + "token_counts/after_think": 52.0, + "token_counts/before_target": 2219.5, + "token_counts/before_think": 747.5 + }, + { + "avg_penalty/after_target": 2.390136629343033, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.41310878843069077, + "avg_penalty/before_think": 0.5872625634074211, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.5, + "completions/max_terminated_length": 574.5, + "completions/mean_length": 267.734375, + "completions/mean_terminated_length": 267.734375, + "completions/min_length": 104.75, + "completions/min_terminated_length": 104.75, + "epoch": 0.229, + "grad_norm": 4.103005409240723, + "kl": 17.0, + "learning_rate": 1.9010770213220916e-05, + "loss": 1.5374, + "num_tokens": 17335998.0, + "reward": 1.5234375, + "reward_std": 0.7870520502328873, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.352499857544899, + "step": 458, + "token_counts/after_target": 838.0, + 
"token_counts/after_think": 14.25, + "token_counts/before_target": 2707.5, + "token_counts/before_think": 724.0 + }, + { + "avg_penalty/after_target": 2.736332267522812, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.450191468000412, + "avg_penalty/before_think": 0.6708419099450111, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 792.5, + "completions/max_terminated_length": 792.5, + "completions/mean_length": 282.890625, + "completions/mean_terminated_length": 282.890625, + "completions/min_length": 78.5, + "completions/min_terminated_length": 78.5, + "epoch": 0.2295, + "grad_norm": 6.135554790496826, + "kl": 24.0625, + "learning_rate": 1.9003187714021936e-05, + "loss": 2.1644, + "num_tokens": 17367127.0, + "reward": 1.44140625, + "reward_std": 0.8446472883224487, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.48148179799318314, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.3919283002614975, + "step": 459, + "token_counts/after_target": 1250.5, + "token_counts/after_think": 71.25, + "token_counts/before_target": 2480.0, + "token_counts/before_think": 724.5 + }, + { + "avg_penalty/after_target": 2.832295835018158, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.34431377798318863, + "avg_penalty/before_think": 0.4283663332462311, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 274.265625, + "completions/mean_terminated_length": 274.265625, + "completions/min_length": 139.0, + 
"completions/min_terminated_length": 139.0, + "epoch": 0.23, + "grad_norm": 19.138023376464844, + "kl": 22.28125, + "learning_rate": 1.8995577789551806e-05, + "loss": 1.6012, + "num_tokens": 17394056.0, + "reward": 1.28515625, + "reward_std": 0.825409546494484, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.5112857818603516, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.354307621717453, + "step": 460, + "token_counts/after_target": 732.75, + "token_counts/after_think": 11.75, + "token_counts/before_target": 2648.75, + "token_counts/before_think": 995.0 + }, + { + "avg_penalty/after_target": 2.7167298793792725, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.38160260766744614, + "avg_penalty/before_think": 0.7227700725197792, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.75, + "completions/max_terminated_length": 656.75, + "completions/mean_length": 340.359375, + "completions/mean_terminated_length": 340.359375, + "completions/min_length": 114.75, + "completions/min_terminated_length": 114.75, + "epoch": 0.2305, + "grad_norm": 22.952821731567383, + "kl": 31.625, + "learning_rate": 1.8987940462991673e-05, + "loss": 2.0989, + "num_tokens": 17426095.0, + "reward": 1.140625, + "reward_std": 0.871597096323967, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.46875, + "rewards/format_reward/std": 0.4713720977306366, + "rewards/tag_count_reward/mean": 0.640625, + "rewards/tag_count_reward/std": 0.41757505387067795, + "step": 461, + "token_counts/after_target": 1127.25, + "token_counts/after_think": 106.0, + "token_counts/before_target": 3599.25, + 
"token_counts/before_think": 613.25 + }, + { + "avg_penalty/after_target": 2.843329429626465, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4045328348875046, + "avg_penalty/before_think": 0.9606122672557831, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.5, + "completions/max_terminated_length": 619.5, + "completions/mean_length": 330.46875, + "completions/mean_terminated_length": 330.46875, + "completions/min_length": 110.25, + "completions/min_terminated_length": 110.25, + "epoch": 0.231, + "grad_norm": 24.217166900634766, + "kl": 33.125, + "learning_rate": 1.8980275757606157e-05, + "loss": 2.2128, + "num_tokens": 17457645.0, + "reward": 1.0390625, + "reward_std": 0.9280295968055725, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.46875, + "rewards/format_reward/std": 0.50393907725811, + "rewards/tag_count_reward/mean": 0.5703125, + "rewards/tag_count_reward/std": 0.44641929119825363, + "step": 462, + "token_counts/after_target": 1375.0, + "token_counts/after_think": 24.75, + "token_counts/before_target": 3178.5, + "token_counts/before_think": 709.25 + }, + { + "avg_penalty/after_target": 2.658986210823059, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.37693658098578453, + "avg_penalty/before_think": 0.8304324597120285, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.75, + "completions/max_terminated_length": 517.75, + "completions/mean_length": 302.46875, + "completions/mean_terminated_length": 302.46875, + "completions/min_length": 101.25, + "completions/min_terminated_length": 101.25, + "epoch": 0.2315, + 
"grad_norm": 12.108687400817871, + "kl": 23.625, + "learning_rate": 1.8972583696743284e-05, + "loss": 1.6789, + "num_tokens": 17487707.0, + "reward": 1.265625, + "reward_std": 1.006612166762352, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.5061737895011902, + "rewards/tag_count_reward/mean": 0.640625, + "rewards/tag_count_reward/std": 0.4728098288178444, + "step": 463, + "token_counts/after_target": 1178.75, + "token_counts/after_think": 49.0, + "token_counts/before_target": 2853.25, + "token_counts/before_think": 758.5 + }, + { + "avg_penalty/after_target": 2.4354674220085144, + "avg_penalty/after_think": 3.8391841650009155, + "avg_penalty/before_target": 0.36406539753079414, + "avg_penalty/before_think": 0.8360460996627808, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 308.15625, + "completions/mean_terminated_length": 308.15625, + "completions/min_length": 156.75, + "completions/min_terminated_length": 156.75, + "epoch": 0.232, + "grad_norm": 2.948674440383911, + "kl": 16.65625, + "learning_rate": 1.8964864303834408e-05, + "loss": 1.3447, + "num_tokens": 17516837.0, + "reward": 1.2421875, + "reward_std": 0.9516418129205704, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4896806851029396, + "rewards/tag_count_reward/mean": 0.6328125, + "rewards/tag_count_reward/std": 0.4666813686490059, + "step": 464, + "token_counts/after_target": 980.5, + "token_counts/after_think": 56.75, + "token_counts/before_target": 3232.25, + "token_counts/before_think": 661.0 + }, + { + 
"avg_penalty/after_target": 2.7517887949943542, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4416544660925865, + "avg_penalty/before_think": 0.6266513466835022, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.75, + "completions/max_terminated_length": 598.75, + "completions/mean_length": 327.609375, + "completions/mean_terminated_length": 327.609375, + "completions/min_length": 134.75, + "completions/min_terminated_length": 134.75, + "epoch": 0.2325, + "grad_norm": 16.020519256591797, + "kl": 9.75, + "learning_rate": 1.895711760239413e-05, + "loss": 1.2481, + "num_tokens": 17547452.0, + "reward": 1.34375, + "reward_std": 0.8814821839332581, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.49345622956752777, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.41012708097696304, + "step": 465, + "token_counts/after_target": 1196.25, + "token_counts/after_think": 128.75, + "token_counts/before_target": 2878.75, + "token_counts/before_think": 1038.0 + }, + { + "avg_penalty/after_target": 2.7064703702926636, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3985730931162834, + "avg_penalty/before_think": 0.7652269899845123, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.25, + "completions/max_terminated_length": 662.25, + "completions/mean_length": 371.265625, + "completions/mean_terminated_length": 371.265625, + "completions/min_length": 107.5, + "completions/min_terminated_length": 107.5, + "epoch": 0.233, + "grad_norm": 9.47978401184082, + "kl": 12.40625, + 
"learning_rate": 1.894934361602025e-05, + "loss": 1.2803, + "num_tokens": 17581709.0, + "reward": 0.87890625, + "reward_std": 0.791881799697876, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.4713720977306366, + "rewards/tag_count_reward/mean": 0.56640625, + "rewards/tag_count_reward/std": 0.3775794878602028, + "step": 466, + "token_counts/after_target": 1370.75, + "token_counts/after_think": 30.5, + "token_counts/before_target": 3559.0, + "token_counts/before_think": 980.0 + }, + { + "avg_penalty/after_target": 2.582668662071228, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4261780306696892, + "avg_penalty/before_think": 0.6425759345293045, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 563.0, + "completions/max_terminated_length": 563.0, + "completions/mean_length": 312.5, + "completions/mean_terminated_length": 312.5, + "completions/min_length": 120.75, + "completions/min_terminated_length": 120.75, + "epoch": 0.2335, + "grad_norm": 11.27645492553711, + "kl": 9.5703125, + "learning_rate": 1.8941542368393683e-05, + "loss": 1.1049, + "num_tokens": 17611037.0, + "reward": 1.27734375, + "reward_std": 0.8914021104574203, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.4970766380429268, + "rewards/tag_count_reward/mean": 0.68359375, + "rewards/tag_count_reward/std": 0.4146229512989521, + "step": 467, + "token_counts/after_target": 1056.75, + "token_counts/after_think": 52.75, + "token_counts/before_target": 3056.0, + "token_counts/before_think": 834.5 + }, + { + "avg_penalty/after_target": 2.7109075784683228, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 
0.42135296389460564, + "avg_penalty/before_think": 0.5924332290887833, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.5, + "completions/max_terminated_length": 646.5, + "completions/mean_length": 331.0625, + "completions/mean_terminated_length": 331.0625, + "completions/min_length": 125.5, + "completions/min_terminated_length": 125.5, + "epoch": 0.234, + "grad_norm": 14.63255786895752, + "kl": 11.8359375, + "learning_rate": 1.893371388327838e-05, + "loss": 1.381, + "num_tokens": 17642705.0, + "reward": 1.48828125, + "reward_std": 0.9851079881191254, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.11180340498685837, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.47354350984096527, + "rewards/tag_count_reward/mean": 0.69140625, + "rewards/tag_count_reward/std": 0.43376143276691437, + "step": 468, + "token_counts/after_target": 1431.75, + "token_counts/after_think": 43.5, + "token_counts/before_target": 3193.25, + "token_counts/before_think": 628.5 + }, + { + "avg_penalty/after_target": 1.9942475259304047, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.46720658987760544, + "avg_penalty/before_think": 0.9005666673183441, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.25, + "completions/max_terminated_length": 684.25, + "completions/mean_length": 379.453125, + "completions/mean_terminated_length": 379.453125, + "completions/min_length": 150.25, + "completions/min_terminated_length": 150.25, + "epoch": 0.2345, + "grad_norm": 6.522617340087891, + "kl": 11.2578125, + "learning_rate": 1.892585818452126e-05, + "loss": 1.249, + "num_tokens": 17678430.0, + 
"reward": 1.46875, + "reward_std": 0.8042024970054626, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4260597825050354, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.3888445496559143, + "step": 469, + "token_counts/after_target": 1490.75, + "token_counts/after_think": 114.0, + "token_counts/before_target": 3452.75, + "token_counts/before_think": 1013.75 + }, + { + "avg_penalty/after_target": 2.357207775115967, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.44978077709674835, + "avg_penalty/before_think": 0.628126822412014, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.25, + "completions/max_terminated_length": 568.25, + "completions/mean_length": 337.453125, + "completions/mean_terminated_length": 337.453125, + "completions/min_length": 171.5, + "completions/min_terminated_length": 171.5, + "epoch": 0.235, + "grad_norm": 4.7039971351623535, + "kl": 13.625, + "learning_rate": 1.8917975296052143e-05, + "loss": 1.3291, + "num_tokens": 17709147.0, + "reward": 1.4453125, + "reward_std": 0.8622294962406158, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.45283494144678116, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.41880372166633606, + "step": 470, + "token_counts/after_target": 1177.25, + "token_counts/after_think": 122.25, + "token_counts/before_target": 3208.0, + "token_counts/before_think": 891.75 + }, + { + "avg_penalty/after_target": 2.374701291322708, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4368171840906143, + "avg_penalty/before_think": 0.5251699462532997, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 711.5, + "completions/max_terminated_length": 662.75, + "completions/mean_length": 354.453125, + "completions/mean_terminated_length": 344.40834045410156, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.2355, + "grad_norm": 16.311933517456055, + "kl": 28.21875, + "learning_rate": 1.891006524188368e-05, + "loss": 1.9117, + "num_tokens": 17741800.0, + "reward": 1.13671875, + "reward_std": 0.9475666284561157, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.53125, + "rewards/format_reward/std": 0.5059641748666763, + "rewards/tag_count_reward/mean": 0.60546875, + "rewards/tag_count_reward/std": 0.4593823030591011, + "step": 471, + "token_counts/after_target": 1318.0, + "token_counts/after_think": 39.25, + "token_counts/before_target": 3591.0, + "token_counts/before_think": 723.0 + }, + { + "avg_penalty/after_target": 3.1314035058021545, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.29792607948184013, + "avg_penalty/before_think": 0.5441004633903503, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.5, + "completions/max_terminated_length": 502.5, + "completions/mean_length": 281.6875, + "completions/mean_terminated_length": 281.6875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.236, + "grad_norm": 11.530243873596191, + "kl": 18.3125, + "learning_rate": 1.8902128046111267e-05, + "loss": 1.3245, + "num_tokens": 17766964.0, + "reward": 1.38671875, + "reward_std": 0.8837085217237473, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.45129410922527313, + "rewards/tag_count_reward/mean": 0.69921875, + "rewards/tag_count_reward/std": 0.43479710817337036, + "step": 472, + "token_counts/after_target": 851.25, + "token_counts/after_think": 62.5, + "token_counts/before_target": 2643.25, + "token_counts/before_think": 950.0 + }, + { + "avg_penalty/after_target": 2.504785716533661, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.5128302425146103, + "avg_penalty/before_think": 0.7939220666885376, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 806.0, + "completions/max_terminated_length": 806.0, + "completions/mean_length": 416.09375, + "completions/mean_terminated_length": 416.09375, + "completions/min_length": 111.75, + "completions/min_terminated_length": 111.75, + "epoch": 0.2365, + "grad_norm": 15.258295059204102, + "kl": 27.46875, + "learning_rate": 1.889416373291298e-05, + "loss": 2.0103, + "num_tokens": 17804474.0, + "reward": 1.26953125, + "reward_std": 0.921282172203064, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4939897432923317, + "rewards/tag_count_reward/mean": 0.66015625, + "rewards/tag_count_reward/std": 0.44260910898447037, + "step": 473, + "token_counts/after_target": 1940.0, + "token_counts/after_think": 25.5, + "token_counts/before_target": 3821.0, + "token_counts/before_think": 871.0 + }, + { + "avg_penalty/after_target": 2.5525651574134827, + "avg_penalty/after_think": 3.8921857476234436, + "avg_penalty/before_target": 0.38494428992271423, + "avg_penalty/before_think": 0.8821130990982056, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 
0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.0, + "completions/max_terminated_length": 670.0, + "completions/mean_length": 334.0, + "completions/mean_terminated_length": 334.0, + "completions/min_length": 82.75, + "completions/min_terminated_length": 82.75, + "epoch": 0.237, + "grad_norm": 13.798359870910645, + "kl": 20.9375, + "learning_rate": 1.888617232654949e-05, + "loss": 1.5134, + "num_tokens": 17835978.0, + "reward": 1.4375, + "reward_std": 0.9976639598608017, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.125, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.49244368076324463, + "rewards/tag_count_reward/mean": 0.703125, + "rewards/tag_count_reward/std": 0.44189538806676865, + "step": 474, + "token_counts/after_target": 1176.0, + "token_counts/after_think": 69.25, + "token_counts/before_target": 3298.0, + "token_counts/before_think": 800.75 + }, + { + "avg_penalty/after_target": 2.1988877654075623, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.40045731514692307, + "avg_penalty/before_think": 0.8191462010145187, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.0, + "completions/max_terminated_length": 649.0, + "completions/mean_length": 327.90625, + "completions/mean_terminated_length": 327.90625, + "completions/min_length": 119.5, + "completions/min_terminated_length": 119.5, + "epoch": 0.2375, + "grad_norm": 14.86380672454834, + "kl": 23.96875, + "learning_rate": 1.8878153851364013e-05, + "loss": 1.6274, + "num_tokens": 17864756.0, + "reward": 1.27734375, + "reward_std": 0.9073075205087662, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.5040994435548782, + "rewards/tag_count_reward/mean": 0.68359375, + "rewards/tag_count_reward/std": 0.42727837711572647, + "step": 475, + "token_counts/after_target": 1157.75, + "token_counts/after_think": 30.75, + "token_counts/before_target": 3321.0, + "token_counts/before_think": 737.0 + }, + { + "avg_penalty/after_target": 2.7354370653629303, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.39447806030511856, + "avg_penalty/before_think": 0.8917139619588852, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.5, + "completions/max_terminated_length": 627.5, + "completions/mean_length": 322.109375, + "completions/mean_terminated_length": 322.109375, + "completions/min_length": 166.25, + "completions/min_terminated_length": 166.25, + "epoch": 0.238, + "grad_norm": 11.065686225891113, + "kl": 19.125, + "learning_rate": 1.887010833178222e-05, + "loss": 1.4355, + "num_tokens": 17894443.0, + "reward": 1.39453125, + "reward_std": 0.8800686448812485, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.48935678601264954, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.41386549919843674, + "step": 476, + "token_counts/after_target": 990.0, + "token_counts/after_think": 45.0, + "token_counts/before_target": 3391.0, + "token_counts/before_think": 727.75 + }, + { + "avg_penalty/after_target": 2.5334752798080444, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.40887274593114853, + "avg_penalty/before_think": 0.5318737179040909, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 291.53125, + "completions/mean_terminated_length": 291.53125, + "completions/min_length": 130.5, + "completions/min_terminated_length": 130.5, + "epoch": 0.2385, + "grad_norm": 7.748604774475098, + "kl": 11.359375, + "learning_rate": 1.8862035792312148e-05, + "loss": 1.083, + "num_tokens": 17927453.0, + "reward": 1.453125, + "reward_std": 0.9081558138132095, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4682852029800415, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.43131309747695923, + "step": 477, + "token_counts/after_target": 1033.25, + "token_counts/after_think": 21.75, + "token_counts/before_target": 2886.5, + "token_counts/before_think": 723.0 + }, + { + "avg_penalty/after_target": 2.3083094656467438, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3803320899605751, + "avg_penalty/before_think": 0.7264713793992996, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 325.265625, + "completions/mean_terminated_length": 325.265625, + "completions/min_length": 179.25, + "completions/min_terminated_length": 179.25, + "epoch": 0.239, + "grad_norm": 4.600127220153809, + "kl": 9.72265625, + "learning_rate": 1.885393625754416e-05, + "loss": 0.9967, + "num_tokens": 17958110.0, + "reward": 1.5234375, + "reward_std": 0.802953690290451, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 
0.4462348371744156, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.37556950002908707, + "step": 478, + "token_counts/after_target": 989.75, + "token_counts/after_think": 35.75, + "token_counts/before_target": 3379.0, + "token_counts/before_think": 799.75 + }, + { + "avg_penalty/after_target": 2.4872187972068787, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.42186935245990753, + "avg_penalty/before_think": 0.7131504565477371, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.25, + "completions/max_terminated_length": 659.25, + "completions/mean_length": 320.34375, + "completions/mean_terminated_length": 320.34375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.2395, + "grad_norm": 13.641833305358887, + "kl": 9.203125, + "learning_rate": 1.884580975215084e-05, + "loss": 1.1787, + "num_tokens": 17988980.0, + "reward": 1.5546875, + "reward_std": 0.7801307588815689, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.43655145168304443, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3594619780778885, + "step": 479, + "token_counts/after_target": 1254.75, + "token_counts/after_think": 81.25, + "token_counts/before_target": 2974.25, + "token_counts/before_think": 815.25 + }, + { + "avg_penalty/after_target": 2.4655682146549225, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.40437670797109604, + "avg_penalty/before_think": 0.7820865660905838, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 
672.25, + "completions/max_terminated_length": 672.25, + "completions/mean_length": 368.234375, + "completions/mean_terminated_length": 368.234375, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.24, + "grad_norm": 10.668067932128906, + "kl": 11.34375, + "learning_rate": 1.8837656300886937e-05, + "loss": 1.2722, + "num_tokens": 18020963.0, + "reward": 1.5390625, + "reward_std": 0.7764152884483337, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.46296359598636627, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.3418859578669071, + "step": 480, + "token_counts/after_target": 1423.25, + "token_counts/after_think": 26.25, + "token_counts/before_target": 3666.75, + "token_counts/before_think": 775.5 + }, + { + "avg_penalty/after_target": 2.71776682138443, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.42114319652318954, + "avg_penalty/before_think": 0.5970511883497238, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.75, + "completions/max_terminated_length": 690.75, + "completions/mean_length": 305.34375, + "completions/mean_terminated_length": 305.34375, + "completions/min_length": 106.75, + "completions/min_terminated_length": 106.75, + "epoch": 0.2405, + "grad_norm": 7.309685707092285, + "kl": 11.453125, + "learning_rate": 1.8829475928589272e-05, + "loss": 1.2559, + "num_tokens": 18051337.0, + "reward": 1.55078125, + "reward_std": 0.8049298971891403, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.43655145168304443, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 
0.38382384926080704, + "step": 481, + "token_counts/after_target": 1119.5, + "token_counts/after_think": 42.0, + "token_counts/before_target": 2904.5, + "token_counts/before_think": 819.5 + }, + { + "avg_penalty/after_target": 2.0592288076877594, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4619966074824333, + "avg_penalty/before_think": 0.7222221344709396, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.25, + "completions/max_terminated_length": 595.25, + "completions/mean_length": 331.890625, + "completions/mean_terminated_length": 331.890625, + "completions/min_length": 96.5, + "completions/min_terminated_length": 96.5, + "epoch": 0.241, + "grad_norm": 5.586615562438965, + "kl": 16.46875, + "learning_rate": 1.882126866017668e-05, + "loss": 1.3412, + "num_tokens": 18082402.0, + "reward": 1.3359375, + "reward_std": 0.8859919607639313, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.6796875, + "rewards/tag_count_reward/std": 0.43884579092264175, + "step": 482, + "token_counts/after_target": 1181.5, + "token_counts/after_think": 80.75, + "token_counts/before_target": 3317.5, + "token_counts/before_think": 730.5 + }, + { + "avg_penalty/after_target": 2.6499202251434326, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.38249240815639496, + "avg_penalty/before_think": 0.7532349973917007, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.25, + "completions/max_terminated_length": 579.25, + "completions/mean_length": 298.015625, + 
"completions/mean_terminated_length": 298.015625, + "completions/min_length": 114.75, + "completions/min_terminated_length": 114.75, + "epoch": 0.2415, + "grad_norm": 13.315713882446289, + "kl": 21.421875, + "learning_rate": 1.8813034520649923e-05, + "loss": 1.6057, + "num_tokens": 18112451.0, + "reward": 1.28125, + "reward_std": 0.821235179901123, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4459725022315979, + "rewards/tag_count_reward/mean": 0.671875, + "rewards/tag_count_reward/std": 0.39063216745853424, + "step": 483, + "token_counts/after_target": 982.5, + "token_counts/after_think": 47.25, + "token_counts/before_target": 3071.75, + "token_counts/before_think": 666.75 + }, + { + "avg_penalty/after_target": 2.8965232968330383, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.39491845667362213, + "avg_penalty/before_think": 0.6364034861326218, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 639.75, + "completions/max_terminated_length": 546.25, + "completions/mean_length": 280.78125, + "completions/mean_terminated_length": 269.1572952270508, + "completions/min_length": 89.75, + "completions/min_terminated_length": 89.75, + "epoch": 0.242, + "grad_norm": 7.074455261230469, + "kl": 20.0625, + "learning_rate": 1.880477353509162e-05, + "loss": 1.5746, + "num_tokens": 18140213.0, + "reward": 1.390625, + "reward_std": 0.8522834479808807, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4597553312778473, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.4102940186858177, + "step": 484, + "token_counts/after_target": 864.75, + 
"token_counts/after_think": 27.75, + "token_counts/before_target": 2723.75, + "token_counts/before_think": 876.25 + }, + { + "avg_penalty/after_target": 2.3881844878196716, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4116356670856476, + "avg_penalty/before_think": 0.6812748312950134, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 691.0, + "completions/max_terminated_length": 691.0, + "completions/mean_length": 312.734375, + "completions/mean_terminated_length": 312.734375, + "completions/min_length": 144.75, + "completions/min_terminated_length": 144.75, + "epoch": 0.2425, + "grad_norm": 5.8163628578186035, + "kl": 20.5625, + "learning_rate": 1.879648572866617e-05, + "loss": 1.643, + "num_tokens": 18170932.0, + "reward": 1.3984375, + "reward_std": 0.8952106982469559, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.48025963455438614, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.4207920432090759, + "step": 485, + "token_counts/after_target": 1065.5, + "token_counts/after_think": 61.75, + "token_counts/before_target": 2963.75, + "token_counts/before_think": 912.75 + }, + { + "avg_penalty/after_target": 2.31700336933136, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.5151621550321579, + "avg_penalty/before_think": 0.7623979300260544, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.25, + "completions/max_terminated_length": 789.25, + "completions/mean_length": 382.25, + "completions/mean_terminated_length": 382.25, + "completions/min_length": 
100.75, + "completions/min_terminated_length": 100.75, + "epoch": 0.243, + "grad_norm": 37.76227569580078, + "kl": 29.84375, + "learning_rate": 1.8788171126619653e-05, + "loss": 2.0731, + "num_tokens": 18204196.0, + "reward": 0.92578125, + "reward_std": 0.8154634833335876, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.359375, + "rewards/format_reward/std": 0.4625816270709038, + "rewards/tag_count_reward/mean": 0.55078125, + "rewards/tag_count_reward/std": 0.4008106291294098, + "step": 486, + "token_counts/after_target": 1611.25, + "token_counts/after_think": 43.0, + "token_counts/before_target": 3704.0, + "token_counts/before_think": 757.75 + }, + { + "avg_penalty/after_target": 3.0068922638893127, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.34621408581733704, + "avg_penalty/before_think": 0.6396024823188782, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.75, + "completions/max_terminated_length": 631.75, + "completions/mean_length": 305.703125, + "completions/mean_terminated_length": 305.703125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.2435, + "grad_norm": 5.930392265319824, + "kl": 15.921875, + "learning_rate": 1.8779829754279806e-05, + "loss": 1.5036, + "num_tokens": 18234689.0, + "reward": 1.328125, + "reward_std": 0.938733384013176, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4939897432923317, + "rewards/tag_count_reward/mean": 0.6875, + "rewards/tag_count_reward/std": 0.4382563382387161, + "step": 487, + "token_counts/after_target": 1089.75, + "token_counts/after_think": 97.5, + "token_counts/before_target": 3087.75, 
+ "token_counts/before_think": 616.25 + }, + { + "avg_penalty/after_target": 2.495954215526581, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.36631541326642036, + "avg_penalty/before_think": 0.5195028558373451, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 655.75, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 284.859375, + "completions/mean_terminated_length": 272.87291717529297, + "completions/min_length": 132.25, + "completions/min_terminated_length": 132.25, + "epoch": 0.244, + "grad_norm": 7.666689395904541, + "kl": 14.484375, + "learning_rate": 1.877146163705589e-05, + "loss": 1.4776, + "num_tokens": 18264568.0, + "reward": 1.375, + "reward_std": 0.8733762204647064, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.47663040459156036, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.4183848425745964, + "step": 488, + "token_counts/after_target": 800.5, + "token_counts/after_think": 77.5, + "token_counts/before_target": 2738.25, + "token_counts/before_think": 941.5 + }, + { + "avg_penalty/after_target": 2.775174558162689, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.38971439003944397, + "avg_penalty/before_think": 0.5812697783112526, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.5, + "completions/max_terminated_length": 585.5, + "completions/mean_length": 258.015625, + "completions/mean_terminated_length": 258.015625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.2445, + 
"grad_norm": 8.979992866516113, + "kl": 13.578125, + "learning_rate": 1.8763066800438638e-05, + "loss": 1.4749, + "num_tokens": 18291241.0, + "reward": 1.46875, + "reward_std": 0.8526393324136734, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45247192680835724, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.41406290978193283, + "step": 489, + "token_counts/after_target": 865.25, + "token_counts/after_think": 101.25, + "token_counts/before_target": 2305.0, + "token_counts/before_think": 856.75 + }, + { + "avg_penalty/after_target": 2.7839687168598175, + "avg_penalty/after_think": 3.7983973026275635, + "avg_penalty/before_target": 0.39493510127067566, + "avg_penalty/before_think": 0.6331375241279602, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.5, + "completions/max_terminated_length": 486.5, + "completions/mean_length": 251.15625, + "completions/mean_terminated_length": 251.15625, + "completions/min_length": 69.75, + "completions/min_terminated_length": 69.75, + "epoch": 0.245, + "grad_norm": 6.948336601257324, + "kl": 13.02734375, + "learning_rate": 1.875464527000018e-05, + "loss": 1.4174, + "num_tokens": 18317587.0, + "reward": 1.49609375, + "reward_std": 0.7379840314388275, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4106728211045265, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.3582198694348335, + "step": 490, + "token_counts/after_target": 881.0, + "token_counts/after_think": 100.75, + "token_counts/before_target": 2261.0, + "token_counts/before_think": 775.75 + }, + { + "avg_penalty/after_target": 
3.026467651128769, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3468557074666023, + "avg_penalty/before_think": 0.5087414309382439, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.5, + "completions/max_terminated_length": 550.5, + "completions/mean_length": 243.0, + "completions/mean_terminated_length": 243.0, + "completions/min_length": 64.5, + "completions/min_terminated_length": 64.5, + "epoch": 0.2455, + "grad_norm": 9.278817176818848, + "kl": 24.34375, + "learning_rate": 1.874619707139396e-05, + "loss": 1.8588, + "num_tokens": 18344083.0, + "reward": 1.1171875, + "reward_std": 0.9503943920135498, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5123475790023804, + "rewards/tag_count_reward/mean": 0.6015625, + "rewards/tag_count_reward/std": 0.45287593454122543, + "step": 491, + "token_counts/after_target": 715.0, + "token_counts/after_think": 73.75, + "token_counts/before_target": 2508.25, + "token_counts/before_think": 591.0 + }, + { + "avg_penalty/after_target": 2.9111313819885254, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3809000179171562, + "avg_penalty/before_think": 0.32720568403601646, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.75, + "completions/max_terminated_length": 558.75, + "completions/mean_length": 244.0625, + "completions/mean_terminated_length": 244.0625, + "completions/min_length": 67.25, + "completions/min_terminated_length": 67.25, + "epoch": 0.246, + "grad_norm": 14.657790184020996, + "kl": 31.59375, + "learning_rate": 1.8737722230354654e-05, + 
"loss": 2.3404, + "num_tokens": 18370263.0, + "reward": 1.17578125, + "reward_std": 0.8942522406578064, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.53125, + "rewards/format_reward/std": 0.5018647313117981, + "rewards/tag_count_reward/mean": 0.62890625, + "rewards/tag_count_reward/std": 0.44940024614334106, + "step": 492, + "token_counts/after_target": 876.5, + "token_counts/after_think": 19.0, + "token_counts/before_target": 2410.5, + "token_counts/before_think": 599.0 + }, + { + "avg_penalty/after_target": 2.4560846984386444, + "avg_penalty/after_think": 2.98875629901886, + "avg_penalty/before_target": 0.42596037685871124, + "avg_penalty/before_think": 0.45252418518066406, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 683.75, + "completions/max_terminated_length": 644.5, + "completions/mean_length": 293.328125, + "completions/mean_terminated_length": 282.5875015258789, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.2465, + "grad_norm": 8.134925842285156, + "kl": 28.4375, + "learning_rate": 1.8729220772698096e-05, + "loss": 2.2642, + "num_tokens": 18399948.0, + "reward": 1.19921875, + "reward_std": 0.9493899941444397, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.5018647313117981, + "rewards/tag_count_reward/mean": 0.60546875, + "rewards/tag_count_reward/std": 0.4623773992061615, + "step": 493, + "token_counts/after_target": 1288.0, + "token_counts/after_think": 24.25, + "token_counts/before_target": 2678.5, + "token_counts/before_think": 702.5 + }, + { + "avg_penalty/after_target": 2.8062002062797546, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 
0.43541954457759857, + "avg_penalty/before_think": 0.49209391325712204, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.25, + "completions/max_terminated_length": 600.25, + "completions/mean_length": 228.640625, + "completions/mean_terminated_length": 228.640625, + "completions/min_length": 77.25, + "completions/min_terminated_length": 77.25, + "epoch": 0.247, + "grad_norm": 9.358012199401855, + "kl": 17.171875, + "learning_rate": 1.8720692724321207e-05, + "loss": 1.7712, + "num_tokens": 18424981.0, + "reward": 1.55859375, + "reward_std": 0.7869455069303513, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42867646366357803, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.37580491602420807, + "step": 494, + "token_counts/after_target": 807.0, + "token_counts/after_think": 37.25, + "token_counts/before_target": 1851.0, + "token_counts/before_think": 963.0 + }, + { + "avg_penalty/after_target": 2.713116377592087, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3835354261100292, + "avg_penalty/before_think": 0.5075294747948647, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.25, + "completions/max_terminated_length": 502.25, + "completions/mean_length": 222.09375, + "completions/mean_terminated_length": 222.09375, + "completions/min_length": 75.25, + "completions/min_terminated_length": 75.25, + "epoch": 0.2475, + "grad_norm": 3.382342576980591, + "kl": 21.15625, + "learning_rate": 1.8712138111201898e-05, + "loss": 1.7518, + "num_tokens": 18450075.0, + "reward": 1.43359375, 
+ "reward_std": 0.8687923103570938, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.46513500809669495, + "rewards/tag_count_reward/mean": 0.73046875, + "rewards/tag_count_reward/std": 0.42015938460826874, + "step": 495, + "token_counts/after_target": 668.75, + "token_counts/after_think": 28.25, + "token_counts/before_target": 2094.0, + "token_counts/before_think": 762.5 + }, + { + "avg_penalty/after_target": 2.6361860632896423, + "avg_penalty/after_think": 2.912998378276825, + "avg_penalty/before_target": 0.3329657129943371, + "avg_penalty/before_think": 0.40623053163290024, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.75, + "completions/max_terminated_length": 431.75, + "completions/mean_length": 178.140625, + "completions/mean_terminated_length": 178.140625, + "completions/min_length": 59.25, + "completions/min_terminated_length": 59.25, + "epoch": 0.248, + "grad_norm": 2.5087547302246094, + "kl": 20.0546875, + "learning_rate": 1.8703556959398998e-05, + "loss": 1.6659, + "num_tokens": 18470068.0, + "reward": 1.46875, + "reward_std": 0.8664747327566147, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.41907910257577896, + "step": 496, + "token_counts/after_target": 495.75, + "token_counts/after_think": 32.75, + "token_counts/before_target": 1548.75, + "token_counts/before_think": 773.0 + }, + { + "avg_penalty/after_target": 2.2088237404823303, + "avg_penalty/after_think": 2.8679991364479065, + "avg_penalty/before_target": 0.4102490358054638, + "avg_penalty/before_think": 0.4502784162759781, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.25, + "completions/max_terminated_length": 591.25, + "completions/mean_length": 208.28125, + "completions/mean_terminated_length": 208.28125, + "completions/min_length": 52.25, + "completions/min_terminated_length": 52.25, + "epoch": 0.2485, + "grad_norm": 6.947420597076416, + "kl": 27.375, + "learning_rate": 1.869494929505219e-05, + "loss": 2.141, + "num_tokens": 18498022.0, + "reward": 1.3125, + "reward_std": 0.9759916365146637, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.48558124154806137, + "rewards/tag_count_reward/mean": 0.65625, + "rewards/tag_count_reward/std": 0.47453171014785767, + "step": 497, + "token_counts/after_target": 700.0, + "token_counts/after_think": 29.5, + "token_counts/before_target": 2138.25, + "token_counts/before_think": 464.75 + }, + { + "avg_penalty/after_target": 2.2083844542503357, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.35874392837285995, + "avg_penalty/before_think": 0.4506862983107567, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.25, + "completions/max_terminated_length": 425.25, + "completions/mean_length": 156.265625, + "completions/mean_terminated_length": 156.265625, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.249, + "grad_norm": 5.611621379852295, + "kl": 17.875, + "learning_rate": 1.8686315144381914e-05, + "loss": 1.6223, + "num_tokens": 18517719.0, + "reward": 1.59375, + "reward_std": 0.8901752531528473, + "rewards/accuracy_reward/mean": 0.046875, + 
"rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4361884370446205, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.41336873918771744, + "step": 498, + "token_counts/after_target": 297.25, + "token_counts/after_think": 130.0, + "token_counts/before_target": 1546.0, + "token_counts/before_think": 527.0 + }, + { + "avg_penalty/after_target": 2.9985433518886566, + "avg_penalty/after_think": 2.919463038444519, + "avg_penalty/before_target": 0.29377003014087677, + "avg_penalty/before_think": 0.33437415212392807, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.25, + "completions/max_terminated_length": 467.25, + "completions/mean_length": 175.34375, + "completions/mean_terminated_length": 175.34375, + "completions/min_length": 50.75, + "completions/min_terminated_length": 50.75, + "epoch": 0.2495, + "grad_norm": 4.956855773925781, + "kl": 22.3125, + "learning_rate": 1.8677654533689287e-05, + "loss": 1.963, + "num_tokens": 18539693.0, + "reward": 1.40625, + "reward_std": 0.9137242883443832, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4761601909995079, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.45005790144205093, + "step": 499, + "token_counts/after_target": 505.75, + "token_counts/after_think": 23.5, + "token_counts/before_target": 1689.0, + "token_counts/before_think": 587.25 + }, + { + "avg_penalty/after_target": 2.273392528295517, + "avg_penalty/after_think": 1.7584523558616638, + "avg_penalty/before_target": 0.4248560816049576, + "avg_penalty/before_think": 0.3544396348297596, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 623.25, + "completions/max_terminated_length": 623.25, + "completions/mean_length": 219.828125, + "completions/mean_terminated_length": 219.828125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.25, + "grad_norm": 4.553860187530518, + "kl": 25.734375, + "learning_rate": 1.866896748935603e-05, + "loss": 2.1916, + "num_tokens": 18562434.0, + "reward": 1.47265625, + "reward_std": 0.840006023645401, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.42206869274377823, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.4183944836258888, + "step": 500, + "token_counts/after_target": 762.5, + "token_counts/after_think": 22.0, + "token_counts/before_target": 1939.75, + "token_counts/before_think": 793.0 + }, + { + "avg_penalty/after_target": 2.6862822771072388, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4299360364675522, + "avg_penalty/before_think": 0.2957179993391037, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 614.75, + "completions/max_terminated_length": 543.75, + "completions/mean_length": 198.03125, + "completions/mean_terminated_length": 185.96667098999023, + "completions/min_length": 65.75, + "completions/min_terminated_length": 65.75, + "epoch": 0.2505, + "grad_norm": 11.951992988586426, + "kl": 32.28125, + "learning_rate": 1.866025403784439e-05, + "loss": 2.3807, + "num_tokens": 18584756.0, + "reward": 1.40234375, + "reward_std": 0.9080160558223724, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + 
"rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.466681070625782, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.4359918460249901, + "step": 501, + "token_counts/after_target": 595.0, + "token_counts/after_think": 36.25, + "token_counts/before_target": 1927.25, + "token_counts/before_think": 610.0 + }, + { + "avg_penalty/after_target": 1.4910496771335602, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4462617449462414, + "avg_penalty/before_think": 0.3511653542518616, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 218.84375, + "completions/mean_terminated_length": 218.84375, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.251, + "grad_norm": 15.052179336547852, + "kl": 31.53125, + "learning_rate": 1.8651514205697046e-05, + "loss": 2.2144, + "num_tokens": 18607658.0, + "reward": 1.515625, + "reward_std": 0.8244843482971191, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4462348371744156, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.39288315176963806, + "step": 502, + "token_counts/after_target": 447.75, + "token_counts/after_think": 46.75, + "token_counts/before_target": 2228.75, + "token_counts/before_think": 778.25 + }, + { + "avg_penalty/after_target": 2.9586506485939026, + "avg_penalty/after_think": 3.0636478662490845, + "avg_penalty/before_target": 0.31231042370200157, + "avg_penalty/before_think": 0.5467021241784096, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 220.0, + "completions/mean_terminated_length": 220.0, + "completions/min_length": 84.5, + "completions/min_terminated_length": 84.5, + "epoch": 0.2515, + "grad_norm": 11.034563064575195, + "kl": 31.1875, + "learning_rate": 1.864274801953705e-05, + "loss": 2.334, + "num_tokens": 18634282.0, + "reward": 1.4453125, + "reward_std": 0.8508827686309814, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4704566150903702, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.4092979356646538, + "step": 503, + "token_counts/after_target": 552.25, + "token_counts/after_think": 40.25, + "token_counts/before_target": 2110.25, + "token_counts/before_think": 817.25 + }, + { + "avg_penalty/after_target": 2.8197194933891296, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.38066961988806725, + "avg_penalty/before_think": 0.47894367575645447, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 674.5, + "completions/max_terminated_length": 667.0, + "completions/mean_length": 245.265625, + "completions/mean_terminated_length": 233.79479217529297, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.252, + "grad_norm": 2.959811210632324, + "kl": 21.421875, + "learning_rate": 1.8633955506067717e-05, + "loss": 1.9495, + "num_tokens": 18658763.0, + "reward": 1.609375, + "reward_std": 0.7122840881347656, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 
0.4000816270709038, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.33160049468278885, + "step": 504, + "token_counts/after_target": 797.25, + "token_counts/after_think": 66.5, + "token_counts/before_target": 2157.25, + "token_counts/before_think": 903.25 + }, + { + "avg_penalty/after_target": 2.196149468421936, + "avg_penalty/after_think": 3.7642509937286377, + "avg_penalty/before_target": 0.4766615442931652, + "avg_penalty/before_think": 0.5759285613894463, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 690.25, + "completions/max_terminated_length": 613.75, + "completions/mean_length": 260.078125, + "completions/mean_terminated_length": 237.91741180419922, + "completions/min_length": 89.5, + "completions/min_terminated_length": 89.5, + "epoch": 0.2525, + "grad_norm": 3.430302143096924, + "kl": 25.84375, + "learning_rate": 1.8625136692072577e-05, + "loss": 2.1539, + "num_tokens": 18685552.0, + "reward": 1.5078125, + "reward_std": 0.8548012673854828, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.125, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4424592927098274, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.3925502225756645, + "step": 505, + "token_counts/after_target": 909.75, + "token_counts/after_think": 141.5, + "token_counts/before_target": 2297.5, + "token_counts/before_think": 812.5 + }, + { + "avg_penalty/after_target": 3.015835762023926, + "avg_penalty/after_think": 2.504871129989624, + "avg_penalty/before_target": 0.4123072624206543, + "avg_penalty/before_think": 0.38833723962306976, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 488.75, + "completions/max_terminated_length": 488.75, + "completions/mean_length": 190.125, + "completions/mean_terminated_length": 190.125, + "completions/min_length": 76.75, + "completions/min_terminated_length": 76.75, + "epoch": 0.253, + "grad_norm": 10.216493606567383, + "kl": 20.4375, + "learning_rate": 1.861629160441526e-05, + "loss": 1.9436, + "num_tokens": 18706488.0, + "reward": 1.47265625, + "reward_std": 0.8203525096178055, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4534844756126404, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.3953571915626526, + "step": 506, + "token_counts/after_target": 707.5, + "token_counts/after_think": 21.75, + "token_counts/before_target": 1578.0, + "token_counts/before_think": 734.75 + }, + { + "avg_penalty/after_target": 2.823640286922455, + "avg_penalty/after_think": 2.72855007648468, + "avg_penalty/before_target": 0.3246099464595318, + "avg_penalty/before_think": 0.4588517025113106, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.75, + "completions/max_terminated_length": 604.75, + "completions/mean_length": 213.125, + "completions/mean_terminated_length": 213.125, + "completions/min_length": 81.5, + "completions/min_terminated_length": 81.5, + "epoch": 0.2535, + "grad_norm": 6.860198974609375, + "kl": 19.09375, + "learning_rate": 1.860742027003944e-05, + "loss": 1.771, + "num_tokens": 18729440.0, + "reward": 1.484375, + "reward_std": 0.8433276116847992, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4339347705245018, + 
"rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.4122864380478859, + "step": 507, + "token_counts/after_target": 720.0, + "token_counts/after_think": 38.25, + "token_counts/before_target": 1867.75, + "token_counts/before_think": 784.0 + }, + { + "avg_penalty/after_target": 2.2853586971759796, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3730088248848915, + "avg_penalty/before_think": 0.3736455738544464, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.75, + "completions/max_terminated_length": 431.75, + "completions/mean_length": 182.046875, + "completions/mean_terminated_length": 182.046875, + "completions/min_length": 77.25, + "completions/min_terminated_length": 77.25, + "epoch": 0.254, + "grad_norm": 6.39637565612793, + "kl": 16.546875, + "learning_rate": 1.8598522715968736e-05, + "loss": 1.5534, + "num_tokens": 18749155.0, + "reward": 1.546875, + "reward_std": 0.7966432422399521, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.37976596504449844, + "step": 508, + "token_counts/after_target": 428.25, + "token_counts/after_think": 35.75, + "token_counts/before_target": 1696.25, + "token_counts/before_think": 752.5 + }, + { + "avg_penalty/after_target": 2.8102214336395264, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4172339364886284, + "avg_penalty/before_think": 0.3863335847854614, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 583.25, + 
"completions/max_terminated_length": 474.75, + "completions/mean_length": 197.359375, + "completions/mean_terminated_length": 184.73229598999023, + "completions/min_length": 43.5, + "completions/min_terminated_length": 43.5, + "epoch": 0.2545, + "grad_norm": 5.652064800262451, + "kl": 27.046875, + "learning_rate": 1.8589598969306646e-05, + "loss": 2.1949, + "num_tokens": 18772794.0, + "reward": 1.2890625, + "reward_std": 0.8898640424013138, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.4581565484404564, + "rewards/tag_count_reward/mean": 0.6640625, + "rewards/tag_count_reward/std": 0.4451170712709427, + "step": 509, + "token_counts/after_target": 698.25, + "token_counts/after_think": 11.5, + "token_counts/before_target": 1814.75, + "token_counts/before_think": 633.25 + }, + { + "avg_penalty/after_target": 3.053670108318329, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3123382404446602, + "avg_penalty/before_think": 0.3952169865369797, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 200.765625, + "completions/mean_terminated_length": 200.765625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.255, + "grad_norm": 3.4758810997009277, + "kl": 19.41015625, + "learning_rate": 1.858064905723645e-05, + "loss": 1.6866, + "num_tokens": 18795355.0, + "reward": 1.5703125, + "reward_std": 0.6550359278917313, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.3507782220840454, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 
0.3261060491204262, + "step": 510, + "token_counts/after_target": 528.5, + "token_counts/after_think": 43.5, + "token_counts/before_target": 1940.0, + "token_counts/before_think": 700.25 + }, + { + "avg_penalty/after_target": 2.6331475377082825, + "avg_penalty/after_think": 3.3171096742153168, + "avg_penalty/before_target": 0.3172594681382179, + "avg_penalty/before_think": 0.5147927552461624, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.25, + "completions/max_terminated_length": 467.25, + "completions/mean_length": 157.015625, + "completions/mean_terminated_length": 157.015625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.2555, + "grad_norm": 4.504714012145996, + "kl": 17.296875, + "learning_rate": 1.8571673007021124e-05, + "loss": 1.5735, + "num_tokens": 18814524.0, + "reward": 1.58203125, + "reward_std": 0.8015945851802826, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4260597825050354, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.35700901597738266, + "step": 511, + "token_counts/after_target": 347.25, + "token_counts/after_think": 37.5, + "token_counts/before_target": 1490.75, + "token_counts/before_think": 636.75 + }, + { + "avg_penalty/after_target": 1.9147863686084747, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.41695933043956757, + "avg_penalty/before_think": 0.4221015200018883, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.75, + "completions/max_terminated_length": 488.75, + "completions/mean_length": 
173.28125, + "completions/mean_terminated_length": 173.28125, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.256, + "grad_norm": 3.4071226119995117, + "kl": 20.65625, + "learning_rate": 1.8562670846003283e-05, + "loss": 1.777, + "num_tokens": 18837326.0, + "reward": 1.52734375, + "reward_std": 0.8106707185506821, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4519384130835533, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.37838173657655716, + "step": 512, + "token_counts/after_target": 481.25, + "token_counts/after_think": 49.0, + "token_counts/before_target": 1531.75, + "token_counts/before_think": 710.5 + }, + { + "avg_penalty/after_target": 2.437273621559143, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3743433728814125, + "avg_penalty/before_think": 0.4318460375070572, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 169.953125, + "completions/mean_terminated_length": 169.953125, + "completions/min_length": 43.25, + "completions/min_terminated_length": 43.25, + "epoch": 0.2565, + "grad_norm": 3.820119857788086, + "kl": 19.796875, + "learning_rate": 1.855364260160507e-05, + "loss": 1.8127, + "num_tokens": 18856619.0, + "reward": 1.58203125, + "reward_std": 0.7547585815191269, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.44091323018074036, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.343147661536932, + "step": 513, + "token_counts/after_target": 494.25, + 
"token_counts/after_think": 31.5, + "token_counts/before_target": 1608.0, + "token_counts/before_think": 585.5 + }, + { + "avg_penalty/after_target": 1.931053638458252, + "avg_penalty/after_think": 2.983125329017639, + "avg_penalty/before_target": 0.3678857386112213, + "avg_penalty/before_think": 0.41199037432670593, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.5, + "completions/max_terminated_length": 511.5, + "completions/mean_length": 181.03125, + "completions/mean_terminated_length": 181.03125, + "completions/min_length": 60.75, + "completions/min_terminated_length": 60.75, + "epoch": 0.257, + "grad_norm": 13.360651016235352, + "kl": 27.375, + "learning_rate": 1.8544588301328077e-05, + "loss": 1.9307, + "num_tokens": 18875677.0, + "reward": 1.33203125, + "reward_std": 0.8836760520935059, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.48456869274377823, + "rewards/tag_count_reward/mean": 0.70703125, + "rewards/tag_count_reward/std": 0.4303411990404129, + "step": 514, + "token_counts/after_target": 397.5, + "token_counts/after_think": 81.25, + "token_counts/before_target": 1720.75, + "token_counts/before_think": 697.0 + }, + { + "avg_penalty/after_target": 2.384417772293091, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.22387275472283363, + "avg_penalty/before_think": 0.46732592582702637, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 195.46875, + "completions/mean_terminated_length": 195.46875, + "completions/min_length": 
62.5, + "completions/min_terminated_length": 62.5, + "epoch": 0.2575, + "grad_norm": 23.82455825805664, + "kl": 27.3828125, + "learning_rate": 1.8535507972753275e-05, + "loss": 1.7141, + "num_tokens": 18897211.0, + "reward": 1.28515625, + "reward_std": 0.739338681101799, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.3852732330560684, + "rewards/tag_count_reward/mean": 0.66015625, + "rewards/tag_count_reward/std": 0.3615225926041603, + "step": 515, + "token_counts/after_target": 225.0, + "token_counts/after_think": 56.5, + "token_counts/before_target": 2128.5, + "token_counts/before_think": 717.5 + }, + { + "avg_penalty/after_target": 2.7866779267787933, + "avg_penalty/after_think": 2.970706820487976, + "avg_penalty/before_target": 0.3286826126277447, + "avg_penalty/before_think": 0.4811975806951523, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.75, + "completions/max_terminated_length": 457.75, + "completions/mean_length": 170.078125, + "completions/mean_terminated_length": 170.078125, + "completions/min_length": 30.5, + "completions/min_terminated_length": 30.5, + "epoch": 0.258, + "grad_norm": 4.035948753356934, + "kl": 20.3125, + "learning_rate": 1.8526401643540924e-05, + "loss": 1.6848, + "num_tokens": 18916064.0, + "reward": 1.5234375, + "reward_std": 0.8599309027194977, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4462348371744156, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.4058510288596153, + "step": 516, + "token_counts/after_target": 415.0, + "token_counts/after_think": 45.0, + "token_counts/before_target": 1443.25, + 
"token_counts/before_think": 818.0 + }, + { + "avg_penalty/after_target": 2.5453184247016907, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.2932310216128826, + "avg_penalty/before_think": 0.3920331858098507, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.75, + "completions/max_terminated_length": 418.75, + "completions/mean_length": 170.921875, + "completions/mean_terminated_length": 170.921875, + "completions/min_length": 56.25, + "completions/min_terminated_length": 56.25, + "epoch": 0.2585, + "grad_norm": 4.275979042053223, + "kl": 18.875, + "learning_rate": 1.851726934143048e-05, + "loss": 1.5254, + "num_tokens": 18936699.0, + "reward": 1.44140625, + "reward_std": 0.8777464777231216, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.47354350984096527, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.4171416908502579, + "step": 517, + "token_counts/after_target": 355.75, + "token_counts/after_think": 65.75, + "token_counts/before_target": 1523.5, + "token_counts/before_think": 789.75 + }, + { + "avg_penalty/after_target": 2.6317877769470215, + "avg_penalty/after_think": 2.763806641101837, + "avg_penalty/before_target": 0.3638836480677128, + "avg_penalty/before_think": 0.37599652260541916, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.75, + "completions/max_terminated_length": 506.75, + "completions/mean_length": 204.359375, + "completions/mean_terminated_length": 204.359375, + "completions/min_length": 55.25, + "completions/min_terminated_length": 55.25, + 
"epoch": 0.259, + "grad_norm": 6.540792942047119, + "kl": 20.4375, + "learning_rate": 1.8508111094240516e-05, + "loss": 1.7783, + "num_tokens": 18962370.0, + "reward": 1.37109375, + "reward_std": 0.8706509470939636, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4788651168346405, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.4175753816962242, + "step": 518, + "token_counts/after_target": 628.5, + "token_counts/after_think": 17.25, + "token_counts/before_target": 1766.75, + "token_counts/before_think": 857.25 + }, + { + "avg_penalty/after_target": 3.0624217987060547, + "avg_penalty/after_think": 2.7493752241134644, + "avg_penalty/before_target": 0.2584638185799122, + "avg_penalty/before_think": 0.31054581329226494, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.5, + "completions/max_terminated_length": 464.5, + "completions/mean_length": 169.640625, + "completions/mean_terminated_length": 169.640625, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.2595, + "grad_norm": 9.071219444274902, + "kl": 14.5859375, + "learning_rate": 1.849892692986864e-05, + "loss": 1.4433, + "num_tokens": 18982395.0, + "reward": 1.5390625, + "reward_std": 0.7863858193159103, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4304215610027313, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.37847501039505005, + "step": 519, + "token_counts/after_target": 349.75, + "token_counts/after_think": 24.75, + "token_counts/before_target": 1585.0, + "token_counts/before_think": 754.75 + }, + { + 
"avg_penalty/after_target": 2.1484167873859406, + "avg_penalty/after_think": 3.7605292797088623, + "avg_penalty/before_target": 0.3326854184269905, + "avg_penalty/before_think": 0.4814668446779251, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.25, + "completions/max_terminated_length": 347.25, + "completions/mean_length": 171.5, + "completions/mean_terminated_length": 171.5, + "completions/min_length": 55.25, + "completions/min_terminated_length": 55.25, + "epoch": 0.26, + "grad_norm": 12.02331256866455, + "kl": 10.546875, + "learning_rate": 1.8489716876291417e-05, + "loss": 1.1852, + "num_tokens": 19001387.0, + "reward": 1.55859375, + "reward_std": 0.7353402003645897, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.39789126068353653, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.3538784608244896, + "step": 520, + "token_counts/after_target": 312.25, + "token_counts/after_think": 91.5, + "token_counts/before_target": 1681.5, + "token_counts/before_think": 658.75 + }, + { + "avg_penalty/after_target": 2.945696532726288, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3035683371126652, + "avg_penalty/before_think": 0.4127676859498024, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 538.25, + "completions/max_terminated_length": 538.25, + "completions/mean_length": 177.953125, + "completions/mean_terminated_length": 177.953125, + "completions/min_length": 64.5, + "completions/min_terminated_length": 64.5, + "epoch": 0.2605, + "grad_norm": 12.929813385009766, + "kl": 
12.3671875, + "learning_rate": 1.848048096156426e-05, + "loss": 1.4329, + "num_tokens": 19022376.0, + "reward": 1.63671875, + "reward_std": 0.7904085367918015, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42867646366357803, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.34431903064250946, + "step": 521, + "token_counts/after_target": 504.75, + "token_counts/after_think": 46.5, + "token_counts/before_target": 1429.0, + "token_counts/before_think": 867.0 + }, + { + "avg_penalty/after_target": 3.367164731025696, + "avg_penalty/after_think": 3.7599384784698486, + "avg_penalty/before_target": 0.2106131799519062, + "avg_penalty/before_think": 0.5293397903442383, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 187.46875, + "completions/mean_terminated_length": 187.46875, + "completions/min_length": 68.75, + "completions/min_terminated_length": 68.75, + "epoch": 0.261, + "grad_norm": 3.369978427886963, + "kl": 12.109375, + "learning_rate": 1.8471219213821374e-05, + "loss": 1.0713, + "num_tokens": 19044758.0, + "reward": 1.6171875, + "reward_std": 0.7904754430055618, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.36720622703433037, + "step": 522, + "token_counts/after_target": 168.25, + "token_counts/after_think": 50.5, + "token_counts/before_target": 1946.0, + "token_counts/before_think": 834.75 + }, + { + "avg_penalty/after_target": 2.914408028125763, + 
"avg_penalty/after_think": 2.87209814786911, + "avg_penalty/before_target": 0.28829437866806984, + "avg_penalty/before_think": 0.5515342801809311, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.5, + "completions/max_terminated_length": 372.5, + "completions/mean_length": 216.03125, + "completions/mean_terminated_length": 216.03125, + "completions/min_length": 68.25, + "completions/min_terminated_length": 68.25, + "epoch": 0.2615, + "grad_norm": 5.464357852935791, + "kl": 16.90625, + "learning_rate": 1.8461931661275642e-05, + "loss": 1.3525, + "num_tokens": 19069400.0, + "reward": 1.41796875, + "reward_std": 0.8618432730436325, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.4692344516515732, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.4162106364965439, + "step": 523, + "token_counts/after_target": 501.5, + "token_counts/after_think": 40.25, + "token_counts/before_target": 2050.75, + "token_counts/before_think": 864.0 + }, + { + "avg_penalty/after_target": 2.4851199984550476, + "avg_penalty/after_think": 2.981696605682373, + "avg_penalty/before_target": 0.34441645070910454, + "avg_penalty/before_think": 0.4689314663410187, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.75, + "completions/max_terminated_length": 440.75, + "completions/mean_length": 237.109375, + "completions/mean_terminated_length": 237.109375, + "completions/min_length": 65.75, + "completions/min_terminated_length": 65.75, + "epoch": 0.262, + "grad_norm": 4.190430641174316, + "kl": 16.140625, + "learning_rate": 
1.8452618332218563e-05, + "loss": 1.3036, + "num_tokens": 19094735.0, + "reward": 1.42578125, + "reward_std": 0.8935653418302536, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4625816270709038, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.43806811422109604, + "step": 524, + "token_counts/after_target": 541.25, + "token_counts/after_think": 154.5, + "token_counts/before_target": 2129.75, + "token_counts/before_think": 968.25 + }, + { + "avg_penalty/after_target": 2.4127883911132812, + "avg_penalty/after_think": 3.641715943813324, + "avg_penalty/before_target": 0.2915750928223133, + "avg_penalty/before_think": 0.5935805663466454, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.25, + "completions/max_terminated_length": 391.25, + "completions/mean_length": 218.1875, + "completions/mean_terminated_length": 218.1875, + "completions/min_length": 75.25, + "completions/min_terminated_length": 75.25, + "epoch": 0.2625, + "grad_norm": 8.660957336425781, + "kl": 18.1875, + "learning_rate": 1.8443279255020153e-05, + "loss": 1.3081, + "num_tokens": 19120683.0, + "reward": 1.359375, + "reward_std": 0.8932158499956131, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4876555874943733, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.4351571947336197, + "step": 525, + "token_counts/after_target": 389.0, + "token_counts/after_think": 98.0, + "token_counts/before_target": 2027.25, + "token_counts/before_think": 976.75 + }, + { + "avg_penalty/after_target": 2.4891073405742645, + "avg_penalty/after_think": 2.878388524055481, + 
"avg_penalty/before_target": 0.3175702281296253, + "avg_penalty/before_think": 0.5735750794410706, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.25, + "completions/max_terminated_length": 419.25, + "completions/mean_length": 216.546875, + "completions/mean_terminated_length": 216.546875, + "completions/min_length": 97.25, + "completions/min_terminated_length": 97.25, + "epoch": 0.263, + "grad_norm": 4.489537715911865, + "kl": 12.7841796875, + "learning_rate": 1.843391445812886e-05, + "loss": 1.1403, + "num_tokens": 19147694.0, + "reward": 1.53515625, + "reward_std": 0.6692199856042862, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.3538651168346405, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.32828672230243683, + "step": 526, + "token_counts/after_target": 499.75, + "token_counts/after_think": 97.5, + "token_counts/before_target": 1798.0, + "token_counts/before_think": 1069.5 + }, + { + "avg_penalty/after_target": 2.809255927801132, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.384750135242939, + "avg_penalty/before_think": 0.5351065993309021, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.5, + "completions/max_terminated_length": 630.5, + "completions/mean_length": 283.1875, + "completions/mean_terminated_length": 283.1875, + "completions/min_length": 75.5, + "completions/min_terminated_length": 75.5, + "epoch": 0.2635, + "grad_norm": 6.442636489868164, + "kl": 19.71875, + "learning_rate": 1.842452397007148e-05, + "loss": 1.6946, + "num_tokens": 19175562.0, + 
"reward": 1.16796875, + "reward_std": 0.8331545889377594, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.49297719448804855, + "rewards/tag_count_reward/mean": 0.66796875, + "rewards/tag_count_reward/std": 0.38790760189294815, + "step": 527, + "token_counts/after_target": 852.25, + "token_counts/after_think": 235.5, + "token_counts/before_target": 1834.0, + "token_counts/before_think": 1609.25 + }, + { + "avg_penalty/after_target": 2.4868399500846863, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3220125362277031, + "avg_penalty/before_think": 0.7124959826469421, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.5, + "completions/max_terminated_length": 498.5, + "completions/mean_length": 254.234375, + "completions/mean_terminated_length": 254.234375, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.264, + "grad_norm": 4.7795515060424805, + "kl": 14.25, + "learning_rate": 1.8415107819453065e-05, + "loss": 1.3292, + "num_tokens": 19200617.0, + "reward": 1.296875, + "reward_std": 0.790075957775116, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.5625, + "rewards/format_reward/std": 0.48935678601264954, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.3556142672896385, + "step": 528, + "token_counts/after_target": 622.0, + "token_counts/after_think": 235.0, + "token_counts/before_target": 1937.5, + "token_counts/before_think": 1273.25 + }, + { + "avg_penalty/after_target": 3.2004238963127136, + "avg_penalty/after_think": 2.7626805007457733, + "avg_penalty/before_target": 0.2837356738746166, + "avg_penalty/before_think": 0.47913452237844467, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 646.0, + "completions/max_terminated_length": 526.75, + "completions/mean_length": 267.34375, + "completions/mean_terminated_length": 255.28646087646484, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.2645, + "grad_norm": 7.701157093048096, + "kl": 19.375, + "learning_rate": 1.8405666034956842e-05, + "loss": 1.472, + "num_tokens": 19228703.0, + "reward": 1.30078125, + "reward_std": 0.8565758317708969, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.49297719448804855, + "rewards/tag_count_reward/mean": 0.70703125, + "rewards/tag_count_reward/std": 0.4012001305818558, + "step": 529, + "token_counts/after_target": 608.25, + "token_counts/after_think": 55.25, + "token_counts/before_target": 2139.0, + "token_counts/before_think": 1475.0 + }, + { + "avg_penalty/after_target": 2.370721220970154, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3741011694073677, + "avg_penalty/before_think": 0.46845434233546257, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.5, + "completions/max_terminated_length": 634.5, + "completions/mean_length": 299.203125, + "completions/mean_terminated_length": 299.203125, + "completions/min_length": 120.75, + "completions/min_terminated_length": 120.75, + "epoch": 0.265, + "grad_norm": 5.846811771392822, + "kl": 15.875, + "learning_rate": 1.8396198645344133e-05, + "loss": 1.4237, + "num_tokens": 19258332.0, + "reward": 1.33984375, + "reward_std": 0.8740151524543762, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.48866813629865646, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.4106734097003937, + "step": 530, + "token_counts/after_target": 767.0, + "token_counts/after_think": 182.0, + "token_counts/before_target": 1968.0, + "token_counts/before_think": 1870.25 + }, + { + "avg_penalty/after_target": 2.823250710964203, + "avg_penalty/after_think": 2.6371113657951355, + "avg_penalty/before_target": 0.3290640786290169, + "avg_penalty/before_think": 0.4709283113479614, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.5, + "completions/max_terminated_length": 454.5, + "completions/mean_length": 244.96875, + "completions/mean_terminated_length": 244.96875, + "completions/min_length": 62.25, + "completions/min_terminated_length": 62.25, + "epoch": 0.2655, + "grad_norm": 9.203191757202148, + "kl": 9.2265625, + "learning_rate": 1.8386705679454243e-05, + "loss": 0.9774, + "num_tokens": 19287402.0, + "reward": 1.625, + "reward_std": 0.9511198550462723, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.19159944355487823, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.47083858400583267, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.38720703125, + "step": 531, + "token_counts/after_target": 597.5, + "token_counts/after_think": 66.0, + "token_counts/before_target": 1616.5, + "token_counts/before_think": 1639.5 + }, + { + "avg_penalty/after_target": 2.390162467956543, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3931329846382141, + "avg_penalty/before_think": 0.5162726119160652, + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.75, + "completions/max_terminated_length": 586.75, + "completions/mean_length": 248.046875, + "completions/mean_terminated_length": 248.046875, + "completions/min_length": 86.75, + "completions/min_terminated_length": 86.75, + "epoch": 0.266, + "grad_norm": 10.27932357788086, + "kl": 11.25, + "learning_rate": 1.837718716620439e-05, + "loss": 1.2355, + "num_tokens": 19315693.0, + "reward": 1.5, + "reward_std": 0.8209013342857361, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4515564441680908, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.3911737874150276, + "step": 532, + "token_counts/after_target": 589.75, + "token_counts/after_think": 87.25, + "token_counts/before_target": 2029.5, + "token_counts/before_think": 1262.25 + }, + { + "avg_penalty/after_target": 2.4393065571784973, + "avg_penalty/after_think": 3.9056991934776306, + "avg_penalty/before_target": 0.32501139491796494, + "avg_penalty/before_think": 0.5898133367300034, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.5, + "completions/max_terminated_length": 456.5, + "completions/mean_length": 222.21875, + "completions/mean_terminated_length": 222.21875, + "completions/min_length": 63.5, + "completions/min_terminated_length": 63.5, + "epoch": 0.2665, + "grad_norm": 4.738666534423828, + "kl": 8.2578125, + "learning_rate": 1.836764313458962e-05, + "loss": 0.9017, + "num_tokens": 19338475.0, + "reward": 1.75390625, + "reward_std": 0.8091063648462296, + "rewards/accuracy_reward/mean": 0.125, + 
"rewards/accuracy_reward/std": 0.12909944355487823, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4075859263539314, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.3312305063009262, + "step": 533, + "token_counts/after_target": 481.25, + "token_counts/after_think": 55.5, + "token_counts/before_target": 2015.75, + "token_counts/before_think": 1003.0 + }, + { + "avg_penalty/after_target": 2.839103877544403, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3217874504625797, + "avg_penalty/before_think": 0.5379337891936302, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.5, + "completions/max_terminated_length": 435.5, + "completions/mean_length": 209.578125, + "completions/mean_terminated_length": 209.578125, + "completions/min_length": 98.25, + "completions/min_terminated_length": 98.25, + "epoch": 0.267, + "grad_norm": 5.744258880615234, + "kl": 8.56640625, + "learning_rate": 1.8358073613682705e-05, + "loss": 1.0056, + "num_tokens": 19361024.0, + "reward": 1.7265625, + "reward_std": 0.7165468037128448, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3758598491549492, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.3176135942339897, + "step": 534, + "token_counts/after_target": 378.0, + "token_counts/after_think": 90.5, + "token_counts/before_target": 1869.0, + "token_counts/before_think": 1015.75 + }, + { + "avg_penalty/after_target": 3.317491054534912, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.29860248416662216, + "avg_penalty/before_think": 0.3062901347875595, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.75, + "completions/max_terminated_length": 435.75, + "completions/mean_length": 187.09375, + "completions/mean_terminated_length": 187.09375, + "completions/min_length": 80.25, + "completions/min_terminated_length": 80.25, + "epoch": 0.2675, + "grad_norm": 6.577942848205566, + "kl": 14.59375, + "learning_rate": 1.8348478632634067e-05, + "loss": 1.4661, + "num_tokens": 19384022.0, + "reward": 1.65625, + "reward_std": 0.7160251289606094, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3837348371744156, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.34271082282066345, + "step": 535, + "token_counts/after_target": 479.0, + "token_counts/after_think": 69.0, + "token_counts/before_target": 1624.25, + "token_counts/before_think": 821.25 + }, + { + "avg_penalty/after_target": 2.8792774081230164, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3706427291035652, + "avg_penalty/before_think": 0.49811962991952896, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 582.25, + "completions/max_terminated_length": 450.5, + "completions/mean_length": 230.34375, + "completions/mean_terminated_length": 218.42813110351562, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.268, + "grad_norm": 8.023609161376953, + "kl": 18.3203125, + "learning_rate": 1.8338858220671683e-05, + "loss": 1.5006, + "num_tokens": 19408364.0, + "reward": 1.640625, + "reward_std": 0.6714828908443451, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.39476002007722855, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.28637801110744476, + "step": 536, + "token_counts/after_target": 665.0, + "token_counts/after_think": 63.0, + "token_counts/before_target": 1657.75, + "token_counts/before_think": 1299.75 + }, + { + "avg_penalty/after_target": 3.0335007309913635, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.2992328219115734, + "avg_penalty/before_think": 0.32549911737442017, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 183.890625, + "completions/mean_terminated_length": 183.890625, + "completions/min_length": 78.75, + "completions/min_terminated_length": 78.75, + "epoch": 0.2685, + "grad_norm": 4.252342700958252, + "kl": 15.1171875, + "learning_rate": 1.8329212407100996e-05, + "loss": 1.2643, + "num_tokens": 19431781.0, + "reward": 1.69140625, + "reward_std": 0.6399202793836594, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.39123913645744324, + "rewards/tag_count_reward/mean": 0.87890625, + "rewards/tag_count_reward/std": 0.26406651362776756, + "step": 537, + "token_counts/after_target": 410.0, + "token_counts/after_think": 59.25, + "token_counts/before_target": 1541.25, + "token_counts/before_think": 931.75 + }, + { + "avg_penalty/after_target": 2.2775823771953583, + "avg_penalty/after_think": 3.9263981580734253, + "avg_penalty/before_target": 0.5978605002164841, + "avg_penalty/before_think": 0.44804053753614426, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 698.75, + "completions/max_terminated_length": 537.5, + "completions/mean_length": 240.546875, + "completions/mean_terminated_length": 214.88616180419922, + "completions/min_length": 30.5, + "completions/min_terminated_length": 30.5, + "epoch": 0.269, + "grad_norm": 19.216190338134766, + "kl": 34.75, + "learning_rate": 1.8319541221304825e-05, + "loss": 2.5348, + "num_tokens": 19457016.0, + "reward": 1.453125, + "reward_std": 0.8718871474266052, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4682852029800415, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.41379454731941223, + "step": 538, + "token_counts/after_target": 1126.25, + "token_counts/after_think": 111.5, + "token_counts/before_target": 1657.0, + "token_counts/before_think": 954.0 + }, + { + "avg_penalty/after_target": 2.553610771894455, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3923381641507149, + "avg_penalty/before_think": 0.3964984640479088, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 694.5, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 264.421875, + "completions/mean_terminated_length": 252.07291793823242, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.2695, + "grad_norm": 43.152549743652344, + "kl": 48.09375, + "learning_rate": 1.8309844692743283e-05, + "loss": 2.8677, + "num_tokens": 19484131.0, + "reward": 1.2109375, + "reward_std": 0.9710146486759186, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 
0.49500229209661484, + "rewards/tag_count_reward/mean": 0.6171875, + "rewards/tag_count_reward/std": 0.482691690325737, + "step": 539, + "token_counts/after_target": 911.0, + "token_counts/after_think": 56.75, + "token_counts/before_target": 2593.25, + "token_counts/before_think": 669.75 + }, + { + "avg_penalty/after_target": 2.532360076904297, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.525403156876564, + "avg_penalty/before_think": 0.3197627030313015, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 833.75, + "completions/max_terminated_length": 566.75, + "completions/mean_length": 224.671875, + "completions/mean_terminated_length": 198.47813034057617, + "completions/min_length": 59.75, + "completions/min_terminated_length": 59.75, + "epoch": 0.27, + "grad_norm": 21.42049789428711, + "kl": 43.0625, + "learning_rate": 1.8300122850953678e-05, + "loss": 3.1731, + "num_tokens": 19507118.0, + "reward": 1.515625, + "reward_std": 0.824204221367836, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44938503205776215, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.38437695801258087, + "step": 540, + "token_counts/after_target": 1069.25, + "token_counts/after_think": 39.0, + "token_counts/before_target": 1686.75, + "token_counts/before_think": 799.75 + }, + { + "avg_penalty/after_target": 2.4922515749931335, + "avg_penalty/after_think": 2.445293515920639, + "avg_penalty/before_target": 0.4285813122987747, + "avg_penalty/before_think": 0.31722525507211685, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + 
"completions/max_length": 647.25, + "completions/max_terminated_length": 606.25, + "completions/mean_length": 222.09375, + "completions/mean_terminated_length": 209.11458587646484, + "completions/min_length": 65.75, + "completions/min_terminated_length": 65.75, + "epoch": 0.2705, + "grad_norm": 19.554357528686523, + "kl": 35.890625, + "learning_rate": 1.8290375725550417e-05, + "loss": 2.4753, + "num_tokens": 19530708.0, + "reward": 1.44921875, + "reward_std": 0.8164798319339752, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4581565484404564, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.38308315724134445, + "step": 541, + "token_counts/after_target": 637.25, + "token_counts/after_think": 54.75, + "token_counts/before_target": 1893.5, + "token_counts/before_think": 968.0 + }, + { + "avg_penalty/after_target": 2.875090479850769, + "avg_penalty/after_think": 1.515740156173706, + "avg_penalty/before_target": 0.3429255485534668, + "avg_penalty/before_think": 0.35725265368819237, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 725.0, + "completions/max_terminated_length": 607.5, + "completions/mean_length": 187.484375, + "completions/mean_terminated_length": 174.08854293823242, + "completions/min_length": 56.5, + "completions/min_terminated_length": 56.5, + "epoch": 0.271, + "grad_norm": 5.495570659637451, + "kl": 30.90625, + "learning_rate": 1.8280603346224945e-05, + "loss": 2.6484, + "num_tokens": 19554419.0, + "reward": 1.6015625, + "reward_std": 0.79993736743927, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + 
"rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3923732340335846, + "step": 542, + "token_counts/after_target": 792.5, + "token_counts/after_think": 21.25, + "token_counts/before_target": 1444.75, + "token_counts/before_think": 741.25 + }, + { + "avg_penalty/after_target": 2.3627891540527344, + "avg_penalty/after_think": 2.9221673607826233, + "avg_penalty/before_target": 0.6114752814173698, + "avg_penalty/before_think": 0.2983150780200958, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 789.25, + "completions/max_terminated_length": 676.75, + "completions/mean_length": 238.796875, + "completions/mean_terminated_length": 214.18080520629883, + "completions/min_length": 65.75, + "completions/min_terminated_length": 65.75, + "epoch": 0.2715, + "grad_norm": 5.446610927581787, + "kl": 35.28125, + "learning_rate": 1.827080574274562e-05, + "loss": 2.8, + "num_tokens": 19579686.0, + "reward": 1.54296875, + "reward_std": 0.800984725356102, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4414467439055443, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.35398315638303757, + "step": 543, + "token_counts/after_target": 1069.75, + "token_counts/after_think": 40.0, + "token_counts/before_target": 1845.0, + "token_counts/before_think": 866.0 + }, + { + "avg_penalty/after_target": 3.270762085914612, + "avg_penalty/after_think": 2.859087824821472, + "avg_penalty/before_target": 0.32330407574772835, + "avg_penalty/before_think": 0.3794201985001564, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + 
"completions/max_length": 572.75, + "completions/max_terminated_length": 572.75, + "completions/mean_length": 219.90625, + "completions/mean_terminated_length": 219.90625, + "completions/min_length": 85.75, + "completions/min_terminated_length": 85.75, + "epoch": 0.272, + "grad_norm": 3.196072816848755, + "kl": 23.96875, + "learning_rate": 1.8260982944957638e-05, + "loss": 2.0137, + "num_tokens": 19604176.0, + "reward": 1.58203125, + "reward_std": 0.8153580129146576, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.42516325414180756, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.3917454853653908, + "step": 544, + "token_counts/after_target": 492.0, + "token_counts/after_think": 57.5, + "token_counts/before_target": 1932.25, + "token_counts/before_think": 1036.75 + }, + { + "avg_penalty/after_target": 2.6535776257514954, + "avg_penalty/after_think": 3.4580448269844055, + "avg_penalty/before_target": 0.2905823178589344, + "avg_penalty/before_think": 0.3765386939048767, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.75, + "completions/max_terminated_length": 493.75, + "completions/mean_length": 165.25, + "completions/mean_terminated_length": 165.25, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.2725, + "grad_norm": 7.754828929901123, + "kl": 16.453125, + "learning_rate": 1.8251134982782952e-05, + "loss": 1.6525, + "num_tokens": 19625440.0, + "reward": 1.66796875, + "reward_std": 0.7478873580694199, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38772592693567276, + "rewards/tag_count_reward/mean": 0.83984375, + 
"rewards/tag_count_reward/std": 0.3628596141934395, + "step": 545, + "token_counts/after_target": 339.0, + "token_counts/after_think": 78.5, + "token_counts/before_target": 1300.75, + "token_counts/before_think": 925.75 + }, + { + "avg_penalty/after_target": 3.263533055782318, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.29419534280896187, + "avg_penalty/before_think": 0.27668987214565277, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 144.03125, + "completions/mean_terminated_length": 144.03125, + "completions/min_length": 50.5, + "completions/min_terminated_length": 50.5, + "epoch": 0.273, + "grad_norm": 15.58311939239502, + "kl": 10.5859375, + "learning_rate": 1.8241261886220155e-05, + "loss": 1.3254, + "num_tokens": 19647058.0, + "reward": 1.75390625, + "reward_std": 0.6451635807752609, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3265564441680908, + "rewards/tag_count_reward/mean": 0.87890625, + "rewards/tag_count_reward/std": 0.3190931901335716, + "step": 546, + "token_counts/after_target": 303.5, + "token_counts/after_think": 47.0, + "token_counts/before_target": 999.0, + "token_counts/before_think": 955.0 + }, + { + "avg_penalty/after_target": 2.106818437576294, + "avg_penalty/after_think": 3.7712961435317993, + "avg_penalty/before_target": 0.35349610820412636, + "avg_penalty/before_think": 0.33046162873506546, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 634.25, + "completions/max_terminated_length": 442.75, + 
"completions/mean_length": 185.875, + "completions/mean_terminated_length": 171.65729331970215, + "completions/min_length": 59.5, + "completions/min_terminated_length": 59.5, + "epoch": 0.2735, + "grad_norm": 9.405563354492188, + "kl": 18.390625, + "learning_rate": 1.8231363685344422e-05, + "loss": 1.8852, + "num_tokens": 19671898.0, + "reward": 1.6640625, + "reward_std": 0.7203686088323593, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3683478757739067, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.3525788486003876, + "step": 547, + "token_counts/after_target": 420.5, + "token_counts/after_think": 74.5, + "token_counts/before_target": 1480.25, + "token_counts/before_think": 998.75 + }, + { + "avg_penalty/after_target": 2.805997848510742, + "avg_penalty/after_think": 2.8475258350372314, + "avg_penalty/before_target": 0.40413424372673035, + "avg_penalty/before_think": 0.4228288419544697, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.75, + "completions/max_terminated_length": 471.75, + "completions/mean_length": 188.5, + "completions/mean_terminated_length": 188.5, + "completions/min_length": 72.75, + "completions/min_terminated_length": 72.75, + "epoch": 0.274, + "grad_norm": 6.12847375869751, + "kl": 15.96875, + "learning_rate": 1.8221440410307375e-05, + "loss": 1.5655, + "num_tokens": 19692538.0, + "reward": 1.65625, + "reward_std": 0.7277554571628571, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3943893313407898, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.34144217520952225, + "step": 548, + 
"token_counts/after_target": 616.75, + "token_counts/after_think": 18.25, + "token_counts/before_target": 1285.25, + "token_counts/before_think": 1095.75 + }, + { + "avg_penalty/after_target": 3.0048457086086273, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.2354198731482029, + "avg_penalty/before_think": 0.36871908605098724, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 141.515625, + "completions/mean_terminated_length": 141.515625, + "completions/min_length": 46.25, + "completions/min_terminated_length": 46.25, + "epoch": 0.2745, + "grad_norm": 7.512722969055176, + "kl": 14.2890625, + "learning_rate": 1.821149209133704e-05, + "loss": 1.0994, + "num_tokens": 19711611.0, + "reward": 1.59375, + "reward_std": 0.813252180814743, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.4066260978579521, + "step": 549, + "token_counts/after_target": 248.25, + "token_counts/after_think": 39.75, + "token_counts/before_target": 1140.75, + "token_counts/before_think": 835.5 + }, + { + "avg_penalty/after_target": 2.97097384929657, + "avg_penalty/after_think": 2.369953900575638, + "avg_penalty/before_target": 0.22509362548589706, + "avg_penalty/before_think": 0.4108431190252304, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.75, + "completions/max_terminated_length": 310.75, + "completions/mean_length": 135.9375, + 
"completions/mean_terminated_length": 135.9375, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.275, + "grad_norm": 15.277560234069824, + "kl": 26.34375, + "learning_rate": 1.8201518758737726e-05, + "loss": 1.7538, + "num_tokens": 19730247.0, + "reward": 1.42578125, + "reward_std": 0.9129554033279419, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4682852029800415, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.45119059830904007, + "step": 550, + "token_counts/after_target": 313.0, + "token_counts/after_think": 45.25, + "token_counts/before_target": 1148.75, + "token_counts/before_think": 668.0 + }, + { + "avg_penalty/after_target": 3.1566600799560547, + "avg_penalty/after_think": 2.748973309993744, + "avg_penalty/before_target": 0.23695765808224678, + "avg_penalty/before_think": 0.3699520416557789, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.25, + "completions/max_terminated_length": 369.25, + "completions/mean_length": 146.25, + "completions/mean_terminated_length": 146.25, + "completions/min_length": 39.5, + "completions/min_terminated_length": 39.5, + "epoch": 0.2755, + "grad_norm": 11.795960426330566, + "kl": 20.05078125, + "learning_rate": 1.819152044288992e-05, + "loss": 1.4686, + "num_tokens": 19752487.0, + "reward": 1.51171875, + "reward_std": 0.830070286989212, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.42078252136707306, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.41374868899583817, + "step": 551, + "token_counts/after_target": 366.25, + 
"token_counts/after_think": 63.0, + "token_counts/before_target": 1245.75, + "token_counts/before_think": 665.0 + }, + { + "avg_penalty/after_target": 3.059540271759033, + "avg_penalty/after_think": 2.883176028728485, + "avg_penalty/before_target": 0.2611643373966217, + "avg_penalty/before_think": 0.4717693254351616, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.5, + "completions/max_terminated_length": 278.5, + "completions/mean_length": 140.09375, + "completions/mean_terminated_length": 140.09375, + "completions/min_length": 54.75, + "completions/min_terminated_length": 54.75, + "epoch": 0.276, + "grad_norm": 8.318824768066406, + "kl": 16.0419921875, + "learning_rate": 1.8181497174250236e-05, + "loss": 1.3395, + "num_tokens": 19771293.0, + "reward": 1.66015625, + "reward_std": 0.6776643842458725, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.37937305867671967, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.30871395021677017, + "step": 552, + "token_counts/after_target": 253.25, + "token_counts/after_think": 50.25, + "token_counts/before_target": 1155.5, + "token_counts/before_think": 782.5 + }, + { + "avg_penalty/after_target": 3.4806524515151978, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.33399318531155586, + "avg_penalty/before_think": 0.3329496309161186, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.75, + "completions/max_terminated_length": 406.75, + "completions/mean_length": 165.921875, + "completions/mean_terminated_length": 165.921875, + 
"completions/min_length": 59.5, + "completions/min_terminated_length": 59.5, + "epoch": 0.2765, + "grad_norm": 4.575151443481445, + "kl": 24.078125, + "learning_rate": 1.8171448983351284e-05, + "loss": 1.9833, + "num_tokens": 19792008.0, + "reward": 1.62109375, + "reward_std": 0.9177558124065399, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.11967839300632477, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.43303824216127396, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.4142191931605339, + "step": 553, + "token_counts/after_target": 516.75, + "token_counts/after_think": 42.0, + "token_counts/before_target": 1397.75, + "token_counts/before_think": 698.25 + }, + { + "avg_penalty/after_target": 2.0573927760124207, + "avg_penalty/after_think": 3.691168785095215, + "avg_penalty/before_target": 0.3729807920753956, + "avg_penalty/before_think": 0.3787098750472069, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.5, + "completions/max_terminated_length": 514.5, + "completions/mean_length": 174.5625, + "completions/mean_terminated_length": 174.5625, + "completions/min_length": 64.75, + "completions/min_terminated_length": 64.75, + "epoch": 0.277, + "grad_norm": 2.37375545501709, + "kl": 17.859375, + "learning_rate": 1.8161375900801603e-05, + "loss": 1.5319, + "num_tokens": 19814332.0, + "reward": 1.66015625, + "reward_std": 0.7437703758478165, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3758598491549492, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.36839659512043, + "step": 554, + "token_counts/after_target": 240.0, + "token_counts/after_think": 81.0, + 
"token_counts/before_target": 1564.0, + "token_counts/before_think": 908.0 + }, + { + "avg_penalty/after_target": 3.3060975074768066, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.37541133910417557, + "avg_penalty/before_think": 0.4923620820045471, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.5, + "completions/max_terminated_length": 695.5, + "completions/mean_length": 236.28125, + "completions/mean_terminated_length": 236.28125, + "completions/min_length": 56.75, + "completions/min_terminated_length": 56.75, + "epoch": 0.2775, + "grad_norm": 8.889055252075195, + "kl": 15.921875, + "learning_rate": 1.815127795728554e-05, + "loss": 1.7007, + "num_tokens": 19839470.0, + "reward": 1.58984375, + "reward_std": 0.7815525531768799, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.43303824216127396, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.3734128326177597, + "step": 555, + "token_counts/after_target": 886.5, + "token_counts/after_think": 73.75, + "token_counts/before_target": 1837.0, + "token_counts/before_think": 983.25 + }, + { + "avg_penalty/after_target": 2.7055185735225677, + "avg_penalty/after_think": 3.975633203983307, + "avg_penalty/before_target": 0.43039555847644806, + "avg_penalty/before_think": 0.5186482295393944, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 536.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 230.6875, + "completions/mean_terminated_length": 206.11830520629883, + "completions/min_length": 82.5, + 
"completions/min_terminated_length": 82.5, + "epoch": 0.278, + "grad_norm": 9.209707260131836, + "kl": 11.326171875, + "learning_rate": 1.8141155183563195e-05, + "loss": 1.3222, + "num_tokens": 19864074.0, + "reward": 1.4296875, + "reward_std": 0.763045147061348, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.49345622956752777, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3118654564023018, + "step": 556, + "token_counts/after_target": 663.5, + "token_counts/after_think": 84.75, + "token_counts/before_target": 2024.25, + "token_counts/before_think": 918.5 + }, + { + "avg_penalty/after_target": 2.855454385280609, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.5034281983971596, + "avg_penalty/before_think": 0.667097382247448, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 689.75, + "completions/max_terminated_length": 689.75, + "completions/mean_length": 374.515625, + "completions/mean_terminated_length": 374.515625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.2785, + "grad_norm": 4.3819355964660645, + "kl": 19.71875, + "learning_rate": 1.8131007610470278e-05, + "loss": 1.7118, + "num_tokens": 19899451.0, + "reward": 1.0390625, + "reward_std": 0.7057588696479797, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4797805994749069, + "rewards/tag_count_reward/mean": 0.6953125, + "rewards/tag_count_reward/std": 0.3028506450355053, + "step": 557, + "token_counts/after_target": 1936.5, + "token_counts/after_think": 39.25, + "token_counts/before_target": 2965.25, + "token_counts/before_think": 1051.25 + 
}, + { + "avg_penalty/after_target": 2.2028941810131073, + "avg_penalty/after_think": 2.825219690799713, + "avg_penalty/before_target": 0.4855658933520317, + "avg_penalty/before_think": 0.7256250977516174, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 744.75, + "completions/max_terminated_length": 694.5, + "completions/mean_length": 377.90625, + "completions/mean_terminated_length": 368.2468795776367, + "completions/min_length": 138.75, + "completions/min_terminated_length": 138.75, + "epoch": 0.279, + "grad_norm": 10.374789237976074, + "kl": 19.1875, + "learning_rate": 1.8120835268918063e-05, + "loss": 1.4668, + "num_tokens": 19938405.0, + "reward": 1.16015625, + "reward_std": 0.7293576151132584, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.390625, + "rewards/format_reward/std": 0.4939897432923317, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.28158410638570786, + "step": 558, + "token_counts/after_target": 1543.5, + "token_counts/after_think": 38.0, + "token_counts/before_target": 3257.25, + "token_counts/before_think": 1207.75 + }, + { + "avg_penalty/after_target": 3.228468418121338, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.423709437251091, + "avg_penalty/before_think": 0.731164276599884, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 951.5, + "completions/max_terminated_length": 847.25, + "completions/mean_length": 409.703125, + "completions/mean_terminated_length": 390.76146697998047, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + 
"epoch": 0.2795, + "grad_norm": 9.704721450805664, + "kl": 26.75, + "learning_rate": 1.8110638189893267e-05, + "loss": 2.1092, + "num_tokens": 19973570.0, + "reward": 0.875, + "reward_std": 0.6386693939566612, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4009781554341316, + "rewards/tag_count_reward/mean": 0.625, + "rewards/tag_count_reward/std": 0.3310485929250717, + "step": 559, + "token_counts/after_target": 1799.5, + "token_counts/after_think": 73.0, + "token_counts/before_target": 2791.25, + "token_counts/before_think": 1891.5 + }, + { + "avg_penalty/after_target": 2.089093714952469, + "avg_penalty/after_think": 3.858728766441345, + "avg_penalty/before_target": 0.3793264254927635, + "avg_penalty/before_think": 0.7237208634614944, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.5, + "completions/max_terminated_length": 586.5, + "completions/mean_length": 308.5625, + "completions/mean_terminated_length": 308.5625, + "completions/min_length": 123.5, + "completions/min_terminated_length": 123.5, + "epoch": 0.28, + "grad_norm": 3.024369716644287, + "kl": 15.765625, + "learning_rate": 1.8100416404457962e-05, + "loss": 1.3015, + "num_tokens": 20003174.0, + "reward": 1.15234375, + "reward_std": 0.7887454032897949, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.5071863383054733, + "rewards/tag_count_reward/mean": 0.69921875, + "rewards/tag_count_reward/std": 0.3708477318286896, + "step": 560, + "token_counts/after_target": 904.5, + "token_counts/after_think": 122.5, + "token_counts/before_target": 2804.25, + "token_counts/before_think": 1105.75 + }, + { + "avg_penalty/after_target": 
2.3878974318504333, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3872477188706398, + "avg_penalty/before_think": 0.4679938778281212, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.25, + "completions/max_terminated_length": 419.25, + "completions/mean_length": 242.296875, + "completions/mean_terminated_length": 242.296875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.2805, + "grad_norm": 10.023555755615234, + "kl": 9.96875, + "learning_rate": 1.8090169943749477e-05, + "loss": 1.0758, + "num_tokens": 20029225.0, + "reward": 1.28515625, + "reward_std": 0.838862270116806, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.4939897432923317, + "rewards/tag_count_reward/mean": 0.70703125, + "rewards/tag_count_reward/std": 0.39557627588510513, + "step": 561, + "token_counts/after_target": 658.5, + "token_counts/after_think": 48.75, + "token_counts/before_target": 1928.5, + "token_counts/before_think": 1241.0 + }, + { + "avg_penalty/after_target": 2.1437041461467743, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.41088584810495377, + "avg_penalty/before_think": 0.5208277106285095, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.25, + "completions/max_terminated_length": 753.25, + "completions/mean_length": 292.546875, + "completions/mean_terminated_length": 292.546875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.281, + "grad_norm": 15.177724838256836, + "kl": 10.1875, + "learning_rate": 
1.8079898838980304e-05, + "loss": 1.3496, + "num_tokens": 20058716.0, + "reward": 1.4375, + "reward_std": 0.8206945955753326, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4581565484404564, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.3805573433637619, + "step": 562, + "token_counts/after_target": 754.5, + "token_counts/after_think": 157.5, + "token_counts/before_target": 2311.0, + "token_counts/before_think": 1457.75 + }, + { + "avg_penalty/after_target": 2.3043355643749237, + "avg_penalty/after_think": 3.446878343820572, + "avg_penalty/before_target": 0.38051285594701767, + "avg_penalty/before_think": 0.6701487898826599, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.75, + "completions/max_terminated_length": 647.75, + "completions/mean_length": 264.75, + "completions/mean_terminated_length": 264.75, + "completions/min_length": 73.75, + "completions/min_terminated_length": 73.75, + "epoch": 0.2815, + "grad_norm": 11.33957576751709, + "kl": 5.9140625, + "learning_rate": 1.806960312143802e-05, + "loss": 0.9298, + "num_tokens": 20088540.0, + "reward": 1.48046875, + "reward_std": 0.7922268360853195, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4682852029800415, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.3467739447951317, + "step": 563, + "token_counts/after_target": 594.0, + "token_counts/after_think": 167.5, + "token_counts/before_target": 2234.0, + "token_counts/before_think": 1240.5 + }, + { + "avg_penalty/after_target": 2.2085163295269012, + "avg_penalty/after_think": 3.872118592262268, + 
"avg_penalty/before_target": 0.2912307046353817, + "avg_penalty/before_think": 0.48776212334632874, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 223.765625, + "completions/mean_terminated_length": 223.765625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.282, + "grad_norm": 6.27140998840332, + "kl": 8.15625, + "learning_rate": 1.805928282248516e-05, + "loss": 0.9225, + "num_tokens": 20112189.0, + "reward": 1.4453125, + "reward_std": 0.8203685134649277, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.48148179799318314, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.3793427348136902, + "step": 564, + "token_counts/after_target": 323.75, + "token_counts/after_think": 177.5, + "token_counts/before_target": 1807.25, + "token_counts/before_think": 1271.75 + }, + { + "avg_penalty/after_target": 2.6668658554553986, + "avg_penalty/after_think": 3.972080111503601, + "avg_penalty/before_target": 0.3085991442203522, + "avg_penalty/before_think": 0.6665142625570297, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.25, + "completions/max_terminated_length": 515.25, + "completions/mean_length": 283.734375, + "completions/mean_terminated_length": 283.734375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.2825, + "grad_norm": 4.701760768890381, + "kl": 12.546875, + "learning_rate": 1.804893797355914e-05, + "loss": 1.2583, + 
"num_tokens": 20142556.0, + "reward": 1.30859375, + "reward_std": 0.8975925147533417, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.49654312431812286, + "rewards/tag_count_reward/mean": 0.69921875, + "rewards/tag_count_reward/std": 0.4162556156516075, + "step": 565, + "token_counts/after_target": 843.5, + "token_counts/after_think": 80.25, + "token_counts/before_target": 2309.25, + "token_counts/before_think": 1306.75 + }, + { + "avg_penalty/after_target": 1.9258290231227875, + "avg_penalty/after_think": 1.9627036452293396, + "avg_penalty/before_target": 0.38352052867412567, + "avg_penalty/before_think": 0.43686895817518234, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.5, + "completions/max_terminated_length": 506.5, + "completions/mean_length": 272.453125, + "completions/mean_terminated_length": 272.453125, + "completions/min_length": 94.75, + "completions/min_terminated_length": 94.75, + "epoch": 0.283, + "grad_norm": 7.162880897521973, + "kl": 16.4375, + "learning_rate": 1.8038568606172172e-05, + "loss": 1.2934, + "num_tokens": 20171385.0, + "reward": 1.265625, + "reward_std": 0.8999509662389755, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.5071863383054733, + "rewards/tag_count_reward/mean": 0.6875, + "rewards/tag_count_reward/std": 0.4316461607813835, + "step": 566, + "token_counts/after_target": 706.5, + "token_counts/after_think": 174.0, + "token_counts/before_target": 2041.75, + "token_counts/before_think": 1437.0 + }, + { + "avg_penalty/after_target": 2.386529862880707, + "avg_penalty/after_think": 3.524095296859741, + "avg_penalty/before_target": 
0.3975303992629051, + "avg_penalty/before_think": 0.6657714694738388, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.0, + "completions/max_terminated_length": 569.0, + "completions/mean_length": 287.34375, + "completions/mean_terminated_length": 287.34375, + "completions/min_length": 53.25, + "completions/min_terminated_length": 53.25, + "epoch": 0.2835, + "grad_norm": 12.254944801330566, + "kl": 26.4375, + "learning_rate": 1.8028174751911147e-05, + "loss": 1.9654, + "num_tokens": 20202303.0, + "reward": 0.94140625, + "reward_std": 0.8777918070554733, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.4704566150903702, + "rewards/tag_count_reward/mean": 0.55078125, + "rewards/tag_count_reward/std": 0.44103386253118515, + "step": 567, + "token_counts/after_target": 1020.0, + "token_counts/after_think": 142.0, + "token_counts/before_target": 2518.0, + "token_counts/before_think": 917.5 + }, + { + "avg_penalty/after_target": 2.6850362420082092, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.39817171543836594, + "avg_penalty/before_think": 0.4464295133948326, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.5, + "completions/max_terminated_length": 514.5, + "completions/mean_length": 245.359375, + "completions/mean_terminated_length": 245.359375, + "completions/min_length": 41.5, + "completions/min_terminated_length": 41.5, + "epoch": 0.284, + "grad_norm": 7.670081615447998, + "kl": 21.34375, + "learning_rate": 1.801775644243754e-05, + "loss": 1.6754, + "num_tokens": 20227574.0, + "reward": 1.21875, + 
"reward_std": 0.870573028922081, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.5049516260623932, + "rewards/tag_count_reward/mean": 0.671875, + "rewards/tag_count_reward/std": 0.4121009409427643, + "step": 568, + "token_counts/after_target": 778.5, + "token_counts/after_think": 95.5, + "token_counts/before_target": 1856.75, + "token_counts/before_think": 1195.0 + }, + { + "avg_penalty/after_target": 2.2269217371940613, + "avg_penalty/after_think": 3.447167456150055, + "avg_penalty/before_target": 0.4949027970433235, + "avg_penalty/before_think": 0.5449561476707458, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 665.5, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 300.15625, + "completions/mean_terminated_length": 288.91770935058594, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.2845, + "grad_norm": 9.717049598693848, + "kl": 26.1875, + "learning_rate": 1.8007313709487334e-05, + "loss": 2.025, + "num_tokens": 20258096.0, + "reward": 1.10546875, + "reward_std": 0.9420141279697418, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5143726766109467, + "rewards/tag_count_reward/mean": 0.60546875, + "rewards/tag_count_reward/std": 0.4594031572341919, + "step": 569, + "token_counts/after_target": 1236.5, + "token_counts/after_think": 44.75, + "token_counts/before_target": 2270.0, + "token_counts/before_think": 1251.25 + }, + { + "avg_penalty/after_target": 3.4547579288482666, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.24608278274536133, + "avg_penalty/before_think": 0.4817383550107479, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.5, + "completions/max_terminated_length": 430.5, + "completions/mean_length": 223.9375, + "completions/mean_terminated_length": 223.9375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.285, + "grad_norm": 3.391957998275757, + "kl": 17.640625, + "learning_rate": 1.799684658487091e-05, + "loss": 1.4813, + "num_tokens": 20282540.0, + "reward": 1.25390625, + "reward_std": 0.9130164533853531, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.5049516260623932, + "rewards/tag_count_reward/mean": 0.67578125, + "rewards/tag_count_reward/std": 0.4403747096657753, + "step": 570, + "token_counts/after_target": 582.75, + "token_counts/after_think": 45.5, + "token_counts/before_target": 1991.5, + "token_counts/before_think": 963.25 + }, + { + "avg_penalty/after_target": 3.4493037462234497, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.35710907727479935, + "avg_penalty/before_think": 0.27397190406918526, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.5, + "completions/max_terminated_length": 588.5, + "completions/mean_length": 234.0, + "completions/mean_terminated_length": 234.0, + "completions/min_length": 94.5, + "completions/min_terminated_length": 94.5, + "epoch": 0.2855, + "grad_norm": 17.744049072265625, + "kl": 15.28125, + "learning_rate": 1.798635510047293e-05, + "loss": 1.7098, + "num_tokens": 20306668.0, + "reward": 1.51171875, + "reward_std": 0.8322036117315292, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44187305867671967, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.40216289460659027, + "step": 571, + "token_counts/after_target": 902.5, + "token_counts/after_think": 60.0, + "token_counts/before_target": 1773.75, + "token_counts/before_think": 1007.75 + }, + { + "avg_penalty/after_target": 3.0111944675445557, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.32263896614313126, + "avg_penalty/before_think": 0.409181360155344, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.25, + "completions/max_terminated_length": 570.25, + "completions/mean_length": 215.359375, + "completions/mean_terminated_length": 215.359375, + "completions/min_length": 88.25, + "completions/min_terminated_length": 88.25, + "epoch": 0.286, + "grad_norm": 6.389527320861816, + "kl": 19.8125, + "learning_rate": 1.797583928825229e-05, + "loss": 1.7807, + "num_tokens": 20329011.0, + "reward": 1.359375, + "reward_std": 0.9288264811038971, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.479247085750103, + "rewards/tag_count_reward/mean": 0.6875, + "rewards/tag_count_reward/std": 0.4532415196299553, + "step": 572, + "token_counts/after_target": 465.5, + "token_counts/after_think": 40.5, + "token_counts/before_target": 2205.0, + "token_counts/before_think": 734.75 + }, + { + "avg_penalty/after_target": 2.122159719467163, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4399770274758339, + "avg_penalty/before_think": 0.3559975251555443, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 
0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.5, + "completions/max_terminated_length": 676.5, + "completions/mean_length": 245.0625, + "completions/mean_terminated_length": 245.0625, + "completions/min_length": 69.75, + "completions/min_terminated_length": 69.75, + "epoch": 0.2865, + "grad_norm": 3.0889008045196533, + "kl": 23.625, + "learning_rate": 1.7965299180241963e-05, + "loss": 1.9062, + "num_tokens": 20357991.0, + "reward": 1.1953125, + "reward_std": 0.9455913454294205, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.48989029973745346, + "rewards/tag_count_reward/mean": 0.6171875, + "rewards/tag_count_reward/std": 0.469953715801239, + "step": 573, + "token_counts/after_target": 730.75, + "token_counts/after_think": 66.0, + "token_counts/before_target": 2354.0, + "token_counts/before_think": 770.25 + }, + { + "avg_penalty/after_target": 3.150827705860138, + "avg_penalty/after_think": 3.924127221107483, + "avg_penalty/before_target": 0.24378632381558418, + "avg_penalty/before_think": 0.5834177136421204, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.25, + "completions/max_terminated_length": 518.25, + "completions/mean_length": 230.203125, + "completions/mean_terminated_length": 230.203125, + "completions/min_length": 88.25, + "completions/min_terminated_length": 88.25, + "epoch": 0.287, + "grad_norm": 2.767982006072998, + "kl": 19.15625, + "learning_rate": 1.795473480854896e-05, + "loss": 1.6914, + "num_tokens": 20383620.0, + "reward": 1.36328125, + "reward_std": 0.9059168249368668, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + 
"rewards/format_reward/std": 0.47663040459156036, + "rewards/tag_count_reward/mean": 0.70703125, + "rewards/tag_count_reward/std": 0.4460761621594429, + "step": 574, + "token_counts/after_target": 508.0, + "token_counts/after_think": 140.75, + "token_counts/before_target": 2136.75, + "token_counts/before_think": 897.75 + }, + { + "avg_penalty/after_target": 2.7636443972587585, + "avg_penalty/after_think": 2.9443319439888, + "avg_penalty/before_target": 0.2853579521179199, + "avg_penalty/before_think": 0.4892891198396683, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.5, + "completions/max_terminated_length": 455.5, + "completions/mean_length": 204.703125, + "completions/mean_terminated_length": 204.703125, + "completions/min_length": 86.5, + "completions/min_terminated_length": 86.5, + "epoch": 0.2875, + "grad_norm": 3.9636685848236084, + "kl": 15.15625, + "learning_rate": 1.7944146205354182e-05, + "loss": 1.297, + "num_tokens": 20407569.0, + "reward": 1.37890625, + "reward_std": 0.8998493701219559, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.48456869274377823, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.43951142579317093, + "step": 575, + "token_counts/after_target": 469.5, + "token_counts/after_think": 35.75, + "token_counts/before_target": 1749.5, + "token_counts/before_think": 1020.5 + }, + { + "avg_penalty/after_target": 2.812689244747162, + "avg_penalty/after_think": 1.9511286616325378, + "avg_penalty/before_target": 0.3455091491341591, + "avg_penalty/before_think": 0.4471807889640331, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 565.75, + "completions/max_terminated_length": 565.75, + "completions/mean_length": 236.21875, + "completions/mean_terminated_length": 236.21875, + "completions/min_length": 98.25, + "completions/min_terminated_length": 98.25, + "epoch": 0.288, + "grad_norm": 6.2186455726623535, + "kl": 26.46875, + "learning_rate": 1.7933533402912354e-05, + "loss": 2.0727, + "num_tokens": 20431071.0, + "reward": 1.3828125, + "reward_std": 0.9154660999774933, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.48148179799318314, + "rewards/tag_count_reward/mean": 0.7109375, + "rewards/tag_count_reward/std": 0.4417620301246643, + "step": 576, + "token_counts/after_target": 558.0, + "token_counts/after_think": 82.5, + "token_counts/before_target": 2290.75, + "token_counts/before_think": 848.25 + }, + { + "avg_penalty/after_target": 2.970370352268219, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.2604646123945713, + "avg_penalty/before_think": 0.3863586559891701, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 194.015625, + "completions/mean_terminated_length": 194.015625, + "completions/min_length": 72.75, + "completions/min_terminated_length": 72.75, + "epoch": 0.2885, + "grad_norm": 4.4600043296813965, + "kl": 17.65625, + "learning_rate": 1.792289643355191e-05, + "loss": 1.556, + "num_tokens": 20452224.0, + "reward": 1.578125, + "reward_std": 0.8113718628883362, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.422013059258461, + 
"rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.39047500491142273, + "step": 577, + "token_counts/after_target": 410.25, + "token_counts/after_think": 70.0, + "token_counts/before_target": 1844.25, + "token_counts/before_think": 779.75 + }, + { + "avg_penalty/after_target": 2.2813306152820587, + "avg_penalty/after_think": 3.943144142627716, + "avg_penalty/before_target": 0.2648808881640434, + "avg_penalty/before_think": 0.4291974827647209, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.25, + "completions/max_terminated_length": 404.25, + "completions/mean_length": 154.21875, + "completions/mean_terminated_length": 154.21875, + "completions/min_length": 66.5, + "completions/min_terminated_length": 66.5, + "epoch": 0.289, + "grad_norm": 3.054342269897461, + "kl": 18.6875, + "learning_rate": 1.7912235329674903e-05, + "loss": 1.6018, + "num_tokens": 20473886.0, + "reward": 1.66015625, + "reward_std": 0.7120237052440643, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.37937305867671967, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.35066529363393784, + "step": 578, + "token_counts/after_target": 223.25, + "token_counts/after_think": 32.75, + "token_counts/before_target": 1533.0, + "token_counts/before_think": 678.5 + }, + { + "avg_penalty/after_target": 2.1814757883548737, + "avg_penalty/after_think": 3.8952863216400146, + "avg_penalty/before_target": 0.39192772656679153, + "avg_penalty/before_think": 0.42658811807632446, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + 
"completions/max_length": 460.75, + "completions/max_terminated_length": 460.75, + "completions/mean_length": 196.859375, + "completions/mean_terminated_length": 196.859375, + "completions/min_length": 80.25, + "completions/min_terminated_length": 80.25, + "epoch": 0.2895, + "grad_norm": 5.361574649810791, + "kl": 20.1015625, + "learning_rate": 1.7901550123756906e-05, + "loss": 1.6792, + "num_tokens": 20496421.0, + "reward": 1.48828125, + "reward_std": 0.7012887895107269, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.37276528775691986, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.34932541847229004, + "step": 579, + "token_counts/after_target": 641.5, + "token_counts/after_think": 45.5, + "token_counts/before_target": 1779.75, + "token_counts/before_think": 683.0 + }, + { + "avg_penalty/after_target": 2.8339864015579224, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.2550851069390774, + "avg_penalty/before_think": 0.37140800803899765, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.25, + "completions/max_terminated_length": 350.25, + "completions/mean_length": 157.9375, + "completions/mean_terminated_length": 157.9375, + "completions/min_length": 73.5, + "completions/min_terminated_length": 73.5, + "epoch": 0.29, + "grad_norm": 8.713430404663086, + "kl": 10.75, + "learning_rate": 1.789084084834691e-05, + "loss": 1.1699, + "num_tokens": 20514657.0, + "reward": 1.6953125, + "reward_std": 0.6496351510286331, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.39123913645744324, + "rewards/tag_count_reward/mean": 0.8828125, + 
"rewards/tag_count_reward/std": 0.29404380172491074, + "step": 580, + "token_counts/after_target": 282.5, + "token_counts/after_think": 39.25, + "token_counts/before_target": 1322.25, + "token_counts/before_think": 883.0 + }, + { + "avg_penalty/after_target": 2.5666386783123016, + "avg_penalty/after_think": 3.880241572856903, + "avg_penalty/before_target": 0.34070657193660736, + "avg_penalty/before_think": 0.39214567095041275, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.5, + "completions/max_terminated_length": 448.5, + "completions/mean_length": 185.671875, + "completions/mean_terminated_length": 185.671875, + "completions/min_length": 70.5, + "completions/min_terminated_length": 70.5, + "epoch": 0.2905, + "grad_norm": 9.48511791229248, + "kl": 12.8271484375, + "learning_rate": 1.788010753606722e-05, + "loss": 1.3317, + "num_tokens": 20537292.0, + "reward": 1.65234375, + "reward_std": 0.6847055107355118, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.3052363768219948, + "step": 581, + "token_counts/after_target": 397.5, + "token_counts/after_think": 77.0, + "token_counts/before_target": 1332.5, + "token_counts/before_think": 1163.75 + }, + { + "avg_penalty/after_target": 2.671626001596451, + "avg_penalty/after_think": 3.4360141158103943, + "avg_penalty/before_target": 0.4164108857512474, + "avg_penalty/before_think": 0.45017270743846893, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 801.75, + 
"completions/max_terminated_length": 778.75, + "completions/mean_length": 308.703125, + "completions/mean_terminated_length": 287.3572998046875, + "completions/min_length": 80.25, + "completions/min_terminated_length": 80.25, + "epoch": 0.291, + "grad_norm": 7.256168842315674, + "kl": 25.720703125, + "learning_rate": 1.7869350219613375e-05, + "loss": 2.0844, + "num_tokens": 20568057.0, + "reward": 1.4375, + "reward_std": 0.8137153834104538, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.43095622956752777, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.39760398864746094, + "step": 582, + "token_counts/after_target": 1344.25, + "token_counts/after_think": 76.5, + "token_counts/before_target": 2322.5, + "token_counts/before_think": 1196.0 + }, + { + "avg_penalty/after_target": 2.7904539108276367, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.321774497628212, + "avg_penalty/before_think": 0.5020023882389069, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.5, + "completions/max_terminated_length": 614.5, + "completions/mean_length": 241.78125, + "completions/mean_terminated_length": 241.78125, + "completions/min_length": 95.25, + "completions/min_terminated_length": 95.25, + "epoch": 0.2915, + "grad_norm": 7.256039619445801, + "kl": 23.3125, + "learning_rate": 1.785856893175402e-05, + "loss": 1.9676, + "num_tokens": 20597467.0, + "reward": 1.48828125, + "reward_std": 0.8308742791414261, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44938503205776215, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 
0.3959467485547066, + "step": 583, + "token_counts/after_target": 816.0, + "token_counts/after_think": 74.0, + "token_counts/before_target": 1901.75, + "token_counts/before_think": 1076.75 + }, + { + "avg_penalty/after_target": 2.46078422665596, + "avg_penalty/after_think": 2.942594528198242, + "avg_penalty/before_target": 0.3338734358549118, + "avg_penalty/before_think": 0.35240917280316353, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 235.4375, + "completions/mean_terminated_length": 235.4375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.292, + "grad_norm": 22.391345977783203, + "kl": 30.625, + "learning_rate": 1.784776370533083e-05, + "loss": 1.9289, + "num_tokens": 20620935.0, + "reward": 1.0546875, + "reward_std": 0.909901037812233, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.46875, + "rewards/format_reward/std": 0.5122983306646347, + "rewards/tag_count_reward/mean": 0.5859375, + "rewards/tag_count_reward/std": 0.44348353892564774, + "step": 584, + "token_counts/after_target": 543.5, + "token_counts/after_think": 33.0, + "token_counts/before_target": 2239.5, + "token_counts/before_think": 951.0 + }, + { + "avg_penalty/after_target": 1.963227778673172, + "avg_penalty/after_think": 3.8536905646324158, + "avg_penalty/before_target": 0.593417227268219, + "avg_penalty/before_think": 0.5342993810772896, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 807.5, + "completions/max_terminated_length": 692.0, + "completions/mean_length": 
321.109375, + "completions/mean_terminated_length": 299.7745590209961, + "completions/min_length": 79.5, + "completions/min_terminated_length": 79.5, + "epoch": 0.2925, + "grad_norm": 9.980775833129883, + "kl": 22.9375, + "learning_rate": 1.78369345732584e-05, + "loss": 1.8811, + "num_tokens": 20652462.0, + "reward": 1.09765625, + "reward_std": 0.8824861198663712, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.484375, + "rewards/format_reward/std": 0.5092606842517853, + "rewards/tag_count_reward/mean": 0.61328125, + "rewards/tag_count_reward/std": 0.40347376465797424, + "step": 585, + "token_counts/after_target": 1220.75, + "token_counts/after_think": 219.75, + "token_counts/before_target": 2376.0, + "token_counts/before_think": 1321.25 + }, + { + "avg_penalty/after_target": 2.4583696722984314, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.6316420808434486, + "avg_penalty/before_think": 0.5765171274542809, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 734.0, + "completions/max_terminated_length": 732.0, + "completions/mean_length": 391.609375, + "completions/mean_terminated_length": 371.1430358886719, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.293, + "grad_norm": 35.725425720214844, + "kl": 43.0, + "learning_rate": 1.782608156852414e-05, + "loss": 2.6059, + "num_tokens": 20688405.0, + "reward": 0.60546875, + "reward_std": 0.5445433333516121, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.2825859263539314, + "rewards/tag_count_reward/mean": 0.48046875, + "rewards/tag_count_reward/std": 0.3261164724826813, + "step": 586, + "token_counts/after_target": 1886.0, + 
"token_counts/after_think": 160.25, + "token_counts/before_target": 2287.25, + "token_counts/before_think": 1932.25 + }, + { + "avg_penalty/after_target": 2.4384475350379944, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.4035560414195061, + "avg_penalty/before_think": 0.5033416897058487, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.75, + "completions/max_terminated_length": 624.75, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 53.75, + "completions/min_terminated_length": 53.75, + "epoch": 0.2935, + "grad_norm": 33.01783752441406, + "kl": 35.875, + "learning_rate": 1.781520472418819e-05, + "loss": 2.12, + "num_tokens": 20717811.0, + "reward": 0.453125, + "reward_std": 0.37408870458602905, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.421875, + "rewards/tag_count_reward/std": 0.29214753583073616, + "step": 587, + "token_counts/after_target": 850.75, + "token_counts/after_think": 92.0, + "token_counts/before_target": 2073.5, + "token_counts/before_think": 1319.25 + }, + { + "avg_penalty/after_target": 2.6372770369052887, + "avg_penalty/after_think": 3.991099178791046, + "avg_penalty/before_target": 0.4523390084505081, + "avg_penalty/before_think": 0.7293218076229095, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 918.5, + "completions/max_terminated_length": 886.5, + "completions/mean_length": 397.875, + "completions/mean_terminated_length": 388.71771240234375, + 
"completions/min_length": 96.25, + "completions/min_terminated_length": 96.25, + "epoch": 0.294, + "grad_norm": 8.047394752502441, + "kl": 28.03125, + "learning_rate": 1.7804304073383298e-05, + "loss": 2.1087, + "num_tokens": 20752091.0, + "reward": 0.515625, + "reward_std": 0.49021872133016586, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.23328252136707306, + "rewards/tag_count_reward/mean": 0.4375, + "rewards/tag_count_reward/std": 0.3165765404701233, + "step": 588, + "token_counts/after_target": 1263.5, + "token_counts/after_think": 437.0, + "token_counts/before_target": 3121.75, + "token_counts/before_think": 1543.75 + }, + { + "avg_penalty/after_target": 1.983326017856598, + "avg_penalty/after_think": 3.7934760451316833, + "avg_penalty/before_target": 0.45419953018426895, + "avg_penalty/before_think": 0.6212128102779388, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 729.75, + "completions/max_terminated_length": 633.5, + "completions/mean_length": 323.921875, + "completions/mean_terminated_length": 313.7822952270508, + "completions/min_length": 84.25, + "completions/min_terminated_length": 84.25, + "epoch": 0.2945, + "grad_norm": 3.3483972549438477, + "kl": 15.8125, + "learning_rate": 1.7793379649314743e-05, + "loss": 1.2874, + "num_tokens": 20782406.0, + "reward": 0.6171875, + "reward_std": 0.4600980430841446, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.23680340498685837, + "rewards/tag_count_reward/mean": 0.5234375, + "rewards/tag_count_reward/std": 0.2799885608255863, + "step": 589, + "token_counts/after_target": 926.25, + "token_counts/after_think": 214.25, + 
"token_counts/before_target": 1960.5, + "token_counts/before_think": 2081.75 + }, + { + "avg_penalty/after_target": 2.4862197041511536, + "avg_penalty/after_think": 3.8949854373931885, + "avg_penalty/before_target": 0.291178897023201, + "avg_penalty/before_think": 0.5362875908613205, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 325.78125, + "completions/mean_terminated_length": 325.78125, + "completions/min_length": 55.5, + "completions/min_terminated_length": 55.5, + "epoch": 0.295, + "grad_norm": 10.867396354675293, + "kl": 9.0546875, + "learning_rate": 1.778243148526021e-05, + "loss": 1.0666, + "num_tokens": 20816872.0, + "reward": 0.796875, + "reward_std": 0.6291016638278961, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.1875, + "rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 0.609375, + "rewards/tag_count_reward/std": 0.30660153180360794, + "step": 590, + "token_counts/after_target": 827.0, + "token_counts/after_think": 244.0, + "token_counts/before_target": 2308.0, + "token_counts/before_think": 1833.5 + }, + { + "avg_penalty/after_target": 2.3606656789779663, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4360646829009056, + "avg_penalty/before_think": 0.5770666524767876, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 687.5, + "completions/max_terminated_length": 588.5, + "completions/mean_length": 304.3125, + "completions/mean_terminated_length": 294.0500030517578, + "completions/min_length": 96.5, + 
"completions/min_terminated_length": 96.5, + "epoch": 0.2955, + "grad_norm": 19.375835418701172, + "kl": 5.91796875, + "learning_rate": 1.777145961456971e-05, + "loss": 1.0491, + "num_tokens": 20846684.0, + "reward": 1.00390625, + "reward_std": 0.7231091856956482, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4745560586452484, + "rewards/tag_count_reward/mean": 0.66015625, + "rewards/tag_count_reward/std": 0.3195497915148735, + "step": 591, + "token_counts/after_target": 932.5, + "token_counts/after_think": 247.0, + "token_counts/before_target": 2148.5, + "token_counts/before_think": 1541.0 + }, + { + "avg_penalty/after_target": 2.730426549911499, + "avg_penalty/after_think": 1.8893683552742004, + "avg_penalty/before_target": 0.32035763561725616, + "avg_penalty/before_think": 0.4811346232891083, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.25, + "completions/max_terminated_length": 494.25, + "completions/mean_length": 243.984375, + "completions/mean_terminated_length": 243.984375, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.296, + "grad_norm": 10.594085693359375, + "kl": 4.23828125, + "learning_rate": 1.776046407066546e-05, + "loss": 0.6755, + "num_tokens": 20873003.0, + "reward": 0.93359375, + "reward_std": 0.7077739238739014, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.46566852182149887, + "rewards/tag_count_reward/mean": 0.62109375, + "rewards/tag_count_reward/std": 0.3370004817843437, + "step": 592, + "token_counts/after_target": 488.75, + "token_counts/after_think": 127.75, + "token_counts/before_target": 1801.25, + 
"token_counts/before_think": 1486.0 + }, + { + "avg_penalty/after_target": 2.64183309674263, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.41448603942990303, + "avg_penalty/before_think": 0.5389158353209496, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 310.078125, + "completions/mean_terminated_length": 310.078125, + "completions/min_length": 101.5, + "completions/min_terminated_length": 101.5, + "epoch": 0.2965, + "grad_norm": 16.165706634521484, + "kl": 3.40625, + "learning_rate": 1.7749444887041797e-05, + "loss": 0.7943, + "num_tokens": 20906352.0, + "reward": 0.99609375, + "reward_std": 0.7666159868240356, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.48935678601264954, + "rewards/tag_count_reward/mean": 0.63671875, + "rewards/tag_count_reward/std": 0.35345207154750824, + "step": 593, + "token_counts/after_target": 872.25, + "token_counts/after_think": 58.5, + "token_counts/before_target": 2212.5, + "token_counts/before_think": 1818.0 + }, + { + "avg_penalty/after_target": 2.9583473801612854, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.31904226168990135, + "avg_penalty/before_think": 0.3199441023170948, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.5, + "completions/max_terminated_length": 433.5, + "completions/mean_length": 256.484375, + "completions/mean_terminated_length": 256.484375, + "completions/min_length": 62.5, + "completions/min_terminated_length": 62.5, + "epoch": 0.297, + 
"grad_norm": 13.294779777526855, + "kl": 4.484375, + "learning_rate": 1.7738402097265063e-05, + "loss": 0.6807, + "num_tokens": 20931311.0, + "reward": 0.94140625, + "reward_std": 0.7461530417203903, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4622559919953346, + "rewards/tag_count_reward/mean": 0.59765625, + "rewards/tag_count_reward/std": 0.3701053112745285, + "step": 594, + "token_counts/after_target": 612.25, + "token_counts/after_think": 27.5, + "token_counts/before_target": 1897.75, + "token_counts/before_think": 1566.25 + }, + { + "avg_penalty/after_target": 2.519028663635254, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3401264362037182, + "avg_penalty/before_think": 0.5730783566832542, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.75, + "completions/max_terminated_length": 454.75, + "completions/mean_length": 256.75, + "completions/mean_terminated_length": 256.75, + "completions/min_length": 74.25, + "completions/min_terminated_length": 74.25, + "epoch": 0.2975, + "grad_norm": 16.67584800720215, + "kl": 4.56640625, + "learning_rate": 1.7727335734973512e-05, + "loss": 0.9351, + "num_tokens": 20957999.0, + "reward": 0.98046875, + "reward_std": 0.7857044190168381, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.48935678601264954, + "rewards/tag_count_reward/mean": 0.63671875, + "rewards/tag_count_reward/std": 0.38623272627592087, + "step": 595, + "token_counts/after_target": 525.25, + "token_counts/after_think": 280.5, + "token_counts/before_target": 2020.0, + "token_counts/before_think": 1282.25 + }, + { + "avg_penalty/after_target": 2.092422664165497, + 
"avg_penalty/after_think": 2.9157742261886597, + "avg_penalty/before_target": 0.32382508367300034, + "avg_penalty/before_think": 0.6249373257160187, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.5, + "completions/max_terminated_length": 496.5, + "completions/mean_length": 274.8125, + "completions/mean_terminated_length": 274.8125, + "completions/min_length": 53.25, + "completions/min_terminated_length": 53.25, + "epoch": 0.298, + "grad_norm": 8.859983444213867, + "kl": 6.4140625, + "learning_rate": 1.7716245833877202e-05, + "loss": 0.8089, + "num_tokens": 20983411.0, + "reward": 0.8984375, + "reward_std": 0.8283812999725342, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4745560586452484, + "rewards/tag_count_reward/mean": 0.5390625, + "rewards/tag_count_reward/std": 0.4274003356695175, + "step": 596, + "token_counts/after_target": 712.0, + "token_counts/after_think": 170.5, + "token_counts/before_target": 2530.25, + "token_counts/before_think": 984.25 + }, + { + "avg_penalty/after_target": 2.248891294002533, + "avg_penalty/after_think": 2.7709458470344543, + "avg_penalty/before_target": 0.301809124648571, + "avg_penalty/before_think": 0.5890868008136749, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.25, + "completions/max_terminated_length": 498.25, + "completions/mean_length": 238.65625, + "completions/mean_terminated_length": 238.65625, + "completions/min_length": 62.75, + "completions/min_terminated_length": 62.75, + "epoch": 0.2985, + "grad_norm": 7.073070049285889, + "kl": 6.8203125, + "learning_rate": 
1.7705132427757895e-05, + "loss": 0.849, + "num_tokens": 21008013.0, + "reward": 1.01171875, + "reward_std": 0.8607723116874695, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.421875, + "rewards/format_reward/std": 0.5092606842517853, + "rewards/tag_count_reward/mean": 0.58984375, + "rewards/tag_count_reward/std": 0.40325386077165604, + "step": 597, + "token_counts/after_target": 588.5, + "token_counts/after_think": 186.75, + "token_counts/before_target": 2063.5, + "token_counts/before_think": 979.75 + }, + { + "avg_penalty/after_target": 2.738856554031372, + "avg_penalty/after_think": 0.9943000078201294, + "avg_penalty/before_target": 0.31888508051633835, + "avg_penalty/before_think": 0.4069531299173832, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.75, + "completions/max_terminated_length": 466.75, + "completions/mean_length": 214.171875, + "completions/mean_terminated_length": 214.171875, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.299, + "grad_norm": 5.809467315673828, + "kl": 11.875, + "learning_rate": 1.7693995550468952e-05, + "loss": 1.0556, + "num_tokens": 21031992.0, + "reward": 0.90234375, + "reward_std": 0.8681828677654266, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.359375, + "rewards/format_reward/std": 0.49244368076324463, + "rewards/tag_count_reward/mean": 0.54296875, + "rewards/tag_count_reward/std": 0.4275354593992233, + "step": 598, + "token_counts/after_target": 546.5, + "token_counts/after_think": 17.75, + "token_counts/before_target": 1953.5, + "token_counts/before_think": 909.0 + }, + { + "avg_penalty/after_target": 2.771789252758026, + "avg_penalty/after_think": 0.0, + 
"avg_penalty/before_target": 0.3220367766916752, + "avg_penalty/before_think": 0.4100370965898037, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.25, + "completions/max_terminated_length": 527.25, + "completions/mean_length": 248.71875, + "completions/mean_terminated_length": 248.71875, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.2995, + "grad_norm": 7.468502044677734, + "kl": 14.296875, + "learning_rate": 1.7682835235935236e-05, + "loss": 1.1252, + "num_tokens": 21059830.0, + "reward": 1.0078125, + "reward_std": 0.8934349417686462, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.40625, + "rewards/format_reward/std": 0.4970766380429268, + "rewards/tag_count_reward/mean": 0.5859375, + "rewards/tag_count_reward/std": 0.42024140805006027, + "step": 599, + "token_counts/after_target": 707.25, + "token_counts/after_think": 0.0, + "token_counts/before_target": 2002.75, + "token_counts/before_think": 1269.5 + }, + { + "avg_penalty/after_target": 2.6001794636249542, + "avg_penalty/after_think": 2.975728392601013, + "avg_penalty/before_target": 0.23461619392037392, + "avg_penalty/before_think": 0.480318620800972, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.25, + "completions/max_terminated_length": 409.25, + "completions/mean_length": 212.921875, + "completions/mean_terminated_length": 212.921875, + "completions/min_length": 50.25, + "completions/min_terminated_length": 50.25, + "epoch": 0.3, + "grad_norm": 7.04196310043335, + "kl": 13.5546875, + "learning_rate": 1.7671651518153e-05, + "loss": 0.9977, + 
"num_tokens": 21085665.0, + "reward": 1.12890625, + "reward_std": 0.8778095245361328, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.515625, + "rewards/format_reward/std": 0.5028772801160812, + "rewards/tag_count_reward/mean": 0.61328125, + "rewards/tag_count_reward/std": 0.4200674593448639, + "step": 600, + "token_counts/after_target": 365.5, + "token_counts/after_think": 101.25, + "token_counts/before_target": 1721.25, + "token_counts/before_think": 1218.75 + }, + { + "avg_penalty/after_target": 2.357261836528778, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.32071269676089287, + "avg_penalty/before_think": 0.5404127016663551, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.5, + "completions/max_terminated_length": 634.5, + "completions/mean_length": 264.453125, + "completions/mean_terminated_length": 264.453125, + "completions/min_length": 70.25, + "completions/min_terminated_length": 70.25, + "epoch": 0.3005, + "grad_norm": 7.8499064445495605, + "kl": 16.984375, + "learning_rate": 1.766044443118978e-05, + "loss": 1.2883, + "num_tokens": 21111614.0, + "reward": 1.20703125, + "reward_std": 0.9182507544755936, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.5049516260623932, + "rewards/tag_count_reward/mean": 0.66015625, + "rewards/tag_count_reward/std": 0.4460701271891594, + "step": 601, + "token_counts/after_target": 610.25, + "token_counts/after_think": 128.75, + "token_counts/before_target": 1992.0, + "token_counts/before_think": 1500.25 + }, + { + "avg_penalty/after_target": 2.877059727907181, + "avg_penalty/after_think": 2.802779793739319, + "avg_penalty/before_target": 0.40018805116415024, + 
"avg_penalty/before_think": 0.4681413024663925, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.75, + "completions/max_terminated_length": 560.75, + "completions/mean_length": 256.84375, + "completions/mean_terminated_length": 256.84375, + "completions/min_length": 43.75, + "completions/min_terminated_length": 43.75, + "epoch": 0.301, + "grad_norm": 13.689253807067871, + "kl": 25.03125, + "learning_rate": 1.7649214009184323e-05, + "loss": 1.7451, + "num_tokens": 21137300.0, + "reward": 0.953125, + "reward_std": 0.9114709198474884, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.40625, + "rewards/format_reward/std": 0.5040994435548782, + "rewards/tag_count_reward/mean": 0.546875, + "rewards/tag_count_reward/std": 0.44585201889276505, + "step": 602, + "token_counts/after_target": 899.25, + "token_counts/after_think": 43.5, + "token_counts/before_target": 2093.25, + "token_counts/before_think": 1073.5 + }, + { + "avg_penalty/after_target": 2.7911716401576996, + "avg_penalty/after_think": 3.9891729950904846, + "avg_penalty/before_target": 0.2489902600646019, + "avg_penalty/before_think": 0.4475402608513832, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.5, + "completions/max_terminated_length": 353.5, + "completions/mean_length": 203.28125, + "completions/mean_terminated_length": 203.28125, + "completions/min_length": 73.5, + "completions/min_terminated_length": 73.5, + "epoch": 0.3015, + "grad_norm": 8.313544273376465, + "kl": 16.5703125, + "learning_rate": 1.7637960286346423e-05, + "loss": 1.2254, + "num_tokens": 21160054.0, + "reward": 1.3671875, + "reward_std": 
0.886958509683609, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.47663040459156036, + "rewards/tag_count_reward/mean": 0.7109375, + "rewards/tag_count_reward/std": 0.4258546680212021, + "step": 603, + "token_counts/after_target": 273.5, + "token_counts/after_think": 131.0, + "token_counts/before_target": 1825.75, + "token_counts/before_think": 1022.25 + }, + { + "avg_penalty/after_target": 3.2005704939365387, + "avg_penalty/after_think": 3.9139281511306763, + "avg_penalty/before_target": 0.2789985314011574, + "avg_penalty/before_think": 0.46023040637373924, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.5, + "completions/max_terminated_length": 420.5, + "completions/mean_length": 212.203125, + "completions/mean_terminated_length": 212.203125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.302, + "grad_norm": 2.8062376976013184, + "kl": 13.671875, + "learning_rate": 1.7626683296956885e-05, + "loss": 1.1978, + "num_tokens": 21183667.0, + "reward": 1.42578125, + "reward_std": 0.856960341334343, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.479247085750103, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.4048111066222191, + "step": 604, + "token_counts/after_target": 322.75, + "token_counts/after_think": 133.0, + "token_counts/before_target": 1890.0, + "token_counts/before_think": 1049.5 + }, + { + "avg_penalty/after_target": 2.5606325268745422, + "avg_penalty/after_think": 3.8487566113471985, + "avg_penalty/before_target": 0.3754763714969158, + "avg_penalty/before_think": 0.37808577716350555, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 580.0, + "completions/max_terminated_length": 450.25, + "completions/mean_length": 269.25, + "completions/mean_terminated_length": 257.6635437011719, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.3025, + "grad_norm": 2.8280439376831055, + "kl": 17.203125, + "learning_rate": 1.761538307536737e-05, + "loss": 1.3951, + "num_tokens": 21210099.0, + "reward": 1.234375, + "reward_std": 0.9380689412355423, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.5071863383054733, + "rewards/tag_count_reward/mean": 0.65625, + "rewards/tag_count_reward/std": 0.4557093232870102, + "step": 605, + "token_counts/after_target": 659.75, + "token_counts/after_think": 60.75, + "token_counts/before_target": 2288.0, + "token_counts/before_think": 1299.5 + }, + { + "avg_penalty/after_target": 2.727519690990448, + "avg_penalty/after_think": 2.6431097388267517, + "avg_penalty/before_target": 0.35586392879486084, + "avg_penalty/before_think": 0.48092585802078247, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 563.0, + "completions/max_terminated_length": 563.0, + "completions/mean_length": 241.15625, + "completions/mean_terminated_length": 241.15625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.303, + "grad_norm": 14.972906112670898, + "kl": 12.25, + "learning_rate": 1.7604059656000313e-05, + "loss": 1.4939, + "num_tokens": 21233869.0, + "reward": 1.58984375, + "reward_std": 0.7657531648874283, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.43303824216127396, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.3522355556488037, + "step": 606, + "token_counts/after_target": 771.25, + "token_counts/after_think": 113.25, + "token_counts/before_target": 1634.5, + "token_counts/before_think": 1339.5 + }, + { + "avg_penalty/after_target": 2.512641489505768, + "avg_penalty/after_think": 1.6102482080459595, + "avg_penalty/before_target": 0.4448801279067993, + "avg_penalty/before_think": 0.45468390733003616, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 256.65625, + "completions/mean_terminated_length": 256.65625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.3035, + "grad_norm": 4.506380558013916, + "kl": 20.828125, + "learning_rate": 1.759271307334881e-05, + "loss": 1.7706, + "num_tokens": 21260919.0, + "reward": 1.26953125, + "reward_std": 0.9430984556674957, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.49776528775691986, + "rewards/tag_count_reward/mean": 0.66015625, + "rewards/tag_count_reward/std": 0.4465281814336777, + "step": 607, + "token_counts/after_target": 928.5, + "token_counts/after_think": 47.75, + "token_counts/before_target": 2064.0, + "token_counts/before_think": 1066.25 + }, + { + "avg_penalty/after_target": 3.265488922595978, + "avg_penalty/after_think": 2.7790289521217346, + "avg_penalty/before_target": 0.3542807847261429, + "avg_penalty/before_think": 0.3829032927751541, + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.5, + "completions/max_terminated_length": 642.5, + "completions/mean_length": 243.3125, + "completions/mean_terminated_length": 243.3125, + "completions/min_length": 113.5, + "completions/min_terminated_length": 113.5, + "epoch": 0.304, + "grad_norm": 12.38314151763916, + "kl": 20.15625, + "learning_rate": 1.7581343361976523e-05, + "loss": 1.9675, + "num_tokens": 21286955.0, + "reward": 1.453125, + "reward_std": 0.8948119580745697, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4761601909995079, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.41515372693538666, + "step": 608, + "token_counts/after_target": 1006.0, + "token_counts/after_think": 60.5, + "token_counts/before_target": 1678.0, + "token_counts/before_think": 1148.5 + }, + { + "avg_penalty/after_target": 2.5127811431884766, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4213351272046566, + "avg_penalty/before_think": 0.5404777079820633, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.0, + "completions/max_terminated_length": 701.0, + "completions/mean_length": 306.28125, + "completions/mean_terminated_length": 306.28125, + "completions/min_length": 109.5, + "completions/min_terminated_length": 109.5, + "epoch": 0.3045, + "grad_norm": 5.0553483963012695, + "kl": 27.71875, + "learning_rate": 1.7569950556517566e-05, + "loss": 2.2027, + "num_tokens": 21317885.0, + "reward": 1.3046875, + "reward_std": 0.9127569496631622, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.48866813629865646, + "rewards/tag_count_reward/mean": 0.6796875, + "rewards/tag_count_reward/std": 0.43623413145542145, + "step": 609, + "token_counts/after_target": 1177.5, + "token_counts/after_think": 109.25, + "token_counts/before_target": 2545.25, + "token_counts/before_think": 1068.5 + }, + { + "avg_penalty/after_target": 2.997871458530426, + "avg_penalty/after_think": 1.5751250982284546, + "avg_penalty/before_target": 0.3780042976140976, + "avg_penalty/before_think": 0.38848819583654404, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.25, + "completions/max_terminated_length": 619.25, + "completions/mean_length": 259.234375, + "completions/mean_terminated_length": 259.234375, + "completions/min_length": 116.5, + "completions/min_terminated_length": 116.5, + "epoch": 0.305, + "grad_norm": 2.9858713150024414, + "kl": 22.78125, + "learning_rate": 1.7558534691676396e-05, + "loss": 1.9345, + "num_tokens": 21343420.0, + "reward": 1.3125, + "reward_std": 0.8724757432937622, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4876555874943733, + "rewards/tag_count_reward/mean": 0.703125, + "rewards/tag_count_reward/std": 0.41280537843704224, + "step": 610, + "token_counts/after_target": 877.5, + "token_counts/after_think": 32.0, + "token_counts/before_target": 1865.75, + "token_counts/before_think": 1372.5 + }, + { + "avg_penalty/after_target": 2.4315590858459473, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.30668048560619354, + "avg_penalty/before_think": 0.3712595924735069, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.75, + "completions/max_terminated_length": 619.75, + "completions/mean_length": 254.6875, + "completions/mean_terminated_length": 254.6875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.3055, + "grad_norm": 7.769923210144043, + "kl": 18.546875, + "learning_rate": 1.7547095802227723e-05, + "loss": 1.7753, + "num_tokens": 21368184.0, + "reward": 1.58203125, + "reward_std": 0.7643758803606033, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4229728877544403, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.3583908975124359, + "step": 611, + "token_counts/after_target": 810.5, + "token_counts/after_think": 78.75, + "token_counts/before_target": 1703.75, + "token_counts/before_think": 1482.0 + }, + { + "avg_penalty/after_target": 2.6512100100517273, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4964364133775234, + "avg_penalty/before_think": 0.3257448077201843, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 829.5, + "completions/max_terminated_length": 617.75, + "completions/mean_length": 273.671875, + "completions/mean_terminated_length": 250.56459045410156, + "completions/min_length": 87.75, + "completions/min_terminated_length": 87.75, + "epoch": 0.306, + "grad_norm": 4.653458595275879, + "kl": 24.4609375, + "learning_rate": 1.7535633923016382e-05, + "loss": 2.1431, + "num_tokens": 21398291.0, + "reward": 1.5390625, + "reward_std": 0.8964362442493439, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.1632782220840454, + "rewards/format_reward/mean": 0.71875, + 
"rewards/format_reward/std": 0.44495995342731476, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.39578716456890106, + "step": 612, + "token_counts/after_target": 1064.5, + "token_counts/after_think": 135.5, + "token_counts/before_target": 1841.0, + "token_counts/before_think": 1337.75 + }, + { + "avg_penalty/after_target": 2.631545126438141, + "avg_penalty/after_think": 3.839597702026367, + "avg_penalty/before_target": 0.290009792894125, + "avg_penalty/before_think": 0.48876622319221497, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.75, + "completions/max_terminated_length": 534.75, + "completions/mean_length": 267.828125, + "completions/mean_terminated_length": 267.828125, + "completions/min_length": 115.75, + "completions/min_terminated_length": 115.75, + "epoch": 0.3065, + "grad_norm": 12.071685791015625, + "kl": 22.5, + "learning_rate": 1.7524149088957244e-05, + "loss": 1.6978, + "num_tokens": 21424984.0, + "reward": 1.45703125, + "reward_std": 0.8808266669511795, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.428548239171505, + "step": 613, + "token_counts/after_target": 589.75, + "token_counts/after_think": 162.75, + "token_counts/before_target": 1938.25, + "token_counts/before_think": 1594.5 + }, + { + "avg_penalty/after_target": 2.5398622155189514, + "avg_penalty/after_think": 3.999111831188202, + "avg_penalty/before_target": 0.5594146400690079, + "avg_penalty/before_think": 0.6016051918268204, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.03125, + "completions/max_length": 797.75, + "completions/max_terminated_length": 671.25, + "completions/mean_length": 384.03125, + "completions/mean_terminated_length": 364.8958435058594, + "completions/min_length": 130.25, + "completions/min_terminated_length": 130.25, + "epoch": 0.307, + "grad_norm": 3.774122476577759, + "kl": 19.203125, + "learning_rate": 1.7512641335035115e-05, + "loss": 1.8362, + "num_tokens": 21461162.0, + "reward": 1.56640625, + "reward_std": 0.831949308514595, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4255262687802315, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.39516496658325195, + "step": 614, + "token_counts/after_target": 1558.0, + "token_counts/after_think": 248.0, + "token_counts/before_target": 1644.25, + "token_counts/before_think": 2694.25 + }, + { + "avg_penalty/after_target": 2.1310352087020874, + "avg_penalty/after_think": 3.86220121383667, + "avg_penalty/before_target": 0.4258900247514248, + "avg_penalty/before_think": 0.5779843032360077, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 692.0, + "completions/max_terminated_length": 563.0, + "completions/mean_length": 344.34375, + "completions/mean_terminated_length": 334.0593795776367, + "completions/min_length": 150.75, + "completions/min_terminated_length": 150.75, + "epoch": 0.3075, + "grad_norm": 7.499375343322754, + "kl": 16.1875, + "learning_rate": 1.7501110696304598e-05, + "loss": 1.3078, + "num_tokens": 21491920.0, + "reward": 1.515625, + "reward_std": 0.8109144419431686, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 
0.4160471484065056, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.39892809838056564, + "step": 615, + "token_counts/after_target": 925.25, + "token_counts/after_think": 270.0, + "token_counts/before_target": 1582.75, + "token_counts/before_think": 2731.5 + }, + { + "avg_penalty/after_target": 2.414403200149536, + "avg_penalty/after_think": 2.948558509349823, + "avg_penalty/before_target": 0.24080722033977509, + "avg_penalty/before_think": 0.4041964113712311, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 245.171875, + "completions/mean_terminated_length": 245.171875, + "completions/min_length": 66.75, + "completions/min_terminated_length": 66.75, + "epoch": 0.308, + "grad_norm": 6.074099540710449, + "kl": 12.4375, + "learning_rate": 1.7489557207890025e-05, + "loss": 1.1029, + "num_tokens": 21517915.0, + "reward": 1.609375, + "reward_std": 0.8569508194923401, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.11180340498685837, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.3648400977253914, + "step": 616, + "token_counts/after_target": 315.75, + "token_counts/after_think": 291.75, + "token_counts/before_target": 1148.25, + "token_counts/before_think": 2167.0 + }, + { + "avg_penalty/after_target": 1.9176137149333954, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4529617130756378, + "avg_penalty/before_think": 0.6511109173297882, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + 
"completions/max_length": 751.25, + "completions/max_terminated_length": 751.25, + "completions/mean_length": 402.65625, + "completions/mean_terminated_length": 402.65625, + "completions/min_length": 213.75, + "completions/min_terminated_length": 213.75, + "epoch": 0.3085, + "grad_norm": 7.793574810028076, + "kl": 15.140625, + "learning_rate": 1.747798090498532e-05, + "loss": 1.4205, + "num_tokens": 21553589.0, + "reward": 1.296875, + "reward_std": 0.8715661317110062, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.497555673122406, + "rewards/tag_count_reward/mean": 0.703125, + "rewards/tag_count_reward/std": 0.4006187170743942, + "step": 617, + "token_counts/after_target": 1052.75, + "token_counts/after_think": 531.25, + "token_counts/before_target": 2242.5, + "token_counts/before_think": 2616.0 + }, + { + "avg_penalty/after_target": 2.247718185186386, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.462823823094368, + "avg_penalty/before_think": 0.2196589671075344, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 384.34375, + "completions/mean_terminated_length": 384.34375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.309, + "grad_norm": 25.244068145751953, + "kl": 32.0625, + "learning_rate": 1.7466381822853915e-05, + "loss": 2.0824, + "num_tokens": 21587963.0, + "reward": 0.44921875, + "reward_std": 0.5983192250132561, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.3186737820506096, + "rewards/tag_count_reward/mean": 0.33984375, + 
"rewards/tag_count_reward/std": 0.32307565212249756, + "step": 618, + "token_counts/after_target": 1703.25, + "token_counts/after_think": 120.25, + "token_counts/before_target": 3849.0, + "token_counts/before_think": 477.0 + }, + { + "avg_penalty/after_target": 2.7050541043281555, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.4731506258249283, + "avg_penalty/before_think": 0.633652538061142, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 805.75, + "completions/max_terminated_length": 748.0, + "completions/mean_length": 451.109375, + "completions/mean_terminated_length": 443.5708465576172, + "completions/min_length": 141.25, + "completions/min_terminated_length": 141.25, + "epoch": 0.3095, + "grad_norm": 22.145334243774414, + "kl": 33.0, + "learning_rate": 1.7454759996828622e-05, + "loss": 2.1913, + "num_tokens": 21626146.0, + "reward": 0.41796875, + "reward_std": 0.5015122145414352, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.23328252136707306, + "rewards/tag_count_reward/mean": 0.33984375, + "rewards/tag_count_reward/std": 0.3058892711997032, + "step": 619, + "token_counts/after_target": 2236.0, + "token_counts/after_think": 0.0, + "token_counts/before_target": 4015.0, + "token_counts/before_think": 966.75 + }, + { + "avg_penalty/after_target": 2.211938202381134, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.45607176423072815, + "avg_penalty/before_think": 0.4526882693171501, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 759.0, + "completions/max_terminated_length": 
653.25, + "completions/mean_length": 340.921875, + "completions/mean_terminated_length": 329.42396545410156, + "completions/min_length": 116.5, + "completions/min_terminated_length": 116.5, + "epoch": 0.31, + "grad_norm": 7.365675926208496, + "kl": 17.640625, + "learning_rate": 1.744311546231154e-05, + "loss": 1.5193, + "num_tokens": 21656349.0, + "reward": 0.6171875, + "reward_std": 0.7384621649980545, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.203125, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.4140625, + "rewards/tag_count_reward/std": 0.37052907794713974, + "step": 620, + "token_counts/after_target": 1345.75, + "token_counts/after_think": 62.0, + "token_counts/before_target": 2937.0, + "token_counts/before_think": 1110.0 + }, + { + "avg_penalty/after_target": 2.457347571849823, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.5400941148400307, + "avg_penalty/before_think": 0.5145878195762634, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 896.0, + "completions/max_terminated_length": 737.25, + "completions/mean_length": 476.25, + "completions/mean_terminated_length": 451.1070098876953, + "completions/min_length": 135.5, + "completions/min_terminated_length": 135.5, + "epoch": 0.3105, + "grad_norm": 4.439813137054443, + "kl": 22.1875, + "learning_rate": 1.7431448254773943e-05, + "loss": 1.8475, + "num_tokens": 21695229.0, + "reward": 0.5625, + "reward_std": 0.7072524875402451, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.171875, + "rewards/format_reward/std": 0.38336414843797684, + "rewards/tag_count_reward/mean": 0.375, + "rewards/tag_count_reward/std": 0.35489366203546524, + "step": 621, 
+ "token_counts/after_target": 2775.0, + "token_counts/after_think": 7.25, + "token_counts/before_target": 4106.25, + "token_counts/before_think": 731.5 + }, + { + "avg_penalty/after_target": 2.554678797721863, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.5788049846887589, + "avg_penalty/before_think": 0.6530025154352188, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 918.5, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 467.59375, + "completions/mean_terminated_length": 458.08646392822266, + "completions/min_length": 171.75, + "completions/min_terminated_length": 171.75, + "epoch": 0.311, + "grad_norm": 8.508553504943848, + "kl": 24.375, + "learning_rate": 1.7419758409756163e-05, + "loss": 1.841, + "num_tokens": 21736883.0, + "reward": 0.66796875, + "reward_std": 0.7255318462848663, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.203125, + "rewards/format_reward/std": 0.39656074345111847, + "rewards/tag_count_reward/mean": 0.46484375, + "rewards/tag_count_reward/std": 0.39044447243213654, + "step": 622, + "token_counts/after_target": 2518.0, + "token_counts/after_think": 0.0, + "token_counts/before_target": 3391.0, + "token_counts/before_think": 1572.5 + }, + { + "avg_penalty/after_target": 2.4039422273635864, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.4855183884501457, + "avg_penalty/before_think": 0.6263232529163361, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 398.78125, + 
"completions/mean_terminated_length": 398.78125, + "completions/min_length": 129.5, + "completions/min_terminated_length": 129.5, + "epoch": 0.3115, + "grad_norm": 6.200787544250488, + "kl": 14.015625, + "learning_rate": 1.74080459628675e-05, + "loss": 1.3099, + "num_tokens": 21771285.0, + "reward": 0.61328125, + "reward_std": 0.7226719707250595, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.203125, + "rewards/format_reward/std": 0.39656074345111847, + "rewards/tag_count_reward/mean": 0.41015625, + "rewards/tag_count_reward/std": 0.39090409874916077, + "step": 623, + "token_counts/after_target": 1787.25, + "token_counts/after_think": 128.5, + "token_counts/before_target": 3588.0, + "token_counts/before_think": 876.75 + }, + { + "avg_penalty/after_target": 2.6911401748657227, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.45622114837169647, + "avg_penalty/before_think": 0.6820387691259384, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 746.5, + "completions/max_terminated_length": 641.5, + "completions/mean_length": 348.703125, + "completions/mean_terminated_length": 322.8260498046875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.312, + "grad_norm": 7.240393161773682, + "kl": 12.671875, + "learning_rate": 1.73963109497861e-05, + "loss": 1.1821, + "num_tokens": 21802962.0, + "reward": 0.55859375, + "reward_std": 0.7066723555326462, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.36483466625213623, + "rewards/tag_count_reward/mean": 0.38671875, + "rewards/tag_count_reward/std": 0.37544044852256775, + "step": 624, + "token_counts/after_target": 1547.0, + 
"token_counts/after_think": 109.75, + "token_counts/before_target": 2881.25, + "token_counts/before_think": 1041.25 + }, + { + "avg_penalty/after_target": 2.1188776195049286, + "avg_penalty/after_think": 2.986023426055908, + "avg_penalty/before_target": 0.5363095328211784, + "avg_penalty/before_think": 0.5980798304080963, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 667.75, + "completions/max_terminated_length": 667.75, + "completions/mean_length": 367.546875, + "completions/mean_terminated_length": 367.546875, + "completions/min_length": 112.75, + "completions/min_terminated_length": 112.75, + "epoch": 0.3125, + "grad_norm": 12.773438453674316, + "kl": 7.8515625, + "learning_rate": 1.7384553406258842e-05, + "loss": 1.0597, + "num_tokens": 21835797.0, + "reward": 1.15625, + "reward_std": 0.8306039422750473, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5061737895011902, + "rewards/tag_count_reward/mean": 0.65625, + "rewards/tag_count_reward/std": 0.37933869659900665, + "step": 625, + "token_counts/after_target": 1399.5, + "token_counts/after_think": 218.75, + "token_counts/before_target": 2566.5, + "token_counts/before_think": 1696.0 + }, + { + "avg_penalty/after_target": 1.8281890153884888, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5270712822675705, + "avg_penalty/before_think": 0.6586194261908531, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.75, + "completions/max_terminated_length": 637.75, + "completions/mean_length": 351.859375, + "completions/mean_terminated_length": 351.859375, + 
"completions/min_length": 103.25, + "completions/min_terminated_length": 103.25, + "epoch": 0.313, + "grad_norm": 15.755610466003418, + "kl": 8.6171875, + "learning_rate": 1.737277336810124e-05, + "loss": 1.1904, + "num_tokens": 21869084.0, + "reward": 1.29296875, + "reward_std": 0.9096764028072357, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4939897432923317, + "rewards/tag_count_reward/mean": 0.66796875, + "rewards/tag_count_reward/std": 0.42305872589349747, + "step": 626, + "token_counts/after_target": 1139.0, + "token_counts/after_think": 319.25, + "token_counts/before_target": 2458.75, + "token_counts/before_think": 1712.75 + }, + { + "avg_penalty/after_target": 1.8405042737722397, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3710635304450989, + "avg_penalty/before_think": 0.6191304922103882, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.5, + "completions/max_terminated_length": 514.5, + "completions/mean_length": 260.375, + "completions/mean_terminated_length": 260.375, + "completions/min_length": 79.5, + "completions/min_terminated_length": 79.5, + "epoch": 0.3135, + "grad_norm": 7.498817443847656, + "kl": 11.5625, + "learning_rate": 1.7360970871197347e-05, + "loss": 1.1646, + "num_tokens": 21896372.0, + "reward": 1.0546875, + "reward_std": 0.8011799603700638, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.40625, + "rewards/format_reward/std": 0.49500229209661484, + "rewards/tag_count_reward/mean": 0.6484375, + "rewards/tag_count_reward/std": 0.3707296550273895, + "step": 627, + "token_counts/after_target": 604.75, + "token_counts/after_think": 237.75, + "token_counts/before_target": 
2414.0, + "token_counts/before_think": 909.5 + }, + { + "avg_penalty/after_target": 2.132744252681732, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.525087982416153, + "avg_penalty/before_think": 0.5536752939224243, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 752.0, + "completions/max_terminated_length": 746.75, + "completions/mean_length": 359.796875, + "completions/mean_terminated_length": 334.99639892578125, + "completions/min_length": 70.5, + "completions/min_terminated_length": 70.5, + "epoch": 0.314, + "grad_norm": 10.036744117736816, + "kl": 14.6796875, + "learning_rate": 1.73491459514996e-05, + "loss": 1.5396, + "num_tokens": 21930967.0, + "reward": 1.06640625, + "reward_std": 0.9077146053314209, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.47669370472431183, + "rewards/tag_count_reward/mean": 0.58203125, + "rewards/tag_count_reward/std": 0.4206446632742882, + "step": 628, + "token_counts/after_target": 1423.25, + "token_counts/after_think": 207.5, + "token_counts/before_target": 3367.25, + "token_counts/before_think": 758.75 + }, + { + "avg_penalty/after_target": 2.4603222012519836, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.4178633689880371, + "avg_penalty/before_think": 0.46287573128938675, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 792.25, + "completions/max_terminated_length": 674.75, + "completions/mean_length": 355.078125, + "completions/mean_terminated_length": 344.21771240234375, + "completions/min_length": 88.25, + 
"completions/min_terminated_length": 88.25, + "epoch": 0.3145, + "grad_norm": 7.4416093826293945, + "kl": 20.6875, + "learning_rate": 1.7337298645028764e-05, + "loss": 1.613, + "num_tokens": 21966172.0, + "reward": 0.84375, + "reward_std": 0.8014957308769226, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.3125, + "rewards/format_reward/std": 0.4704566150903702, + "rewards/tag_count_reward/mean": 0.53125, + "rewards/tag_count_reward/std": 0.4054366797208786, + "step": 629, + "token_counts/after_target": 1323.5, + "token_counts/after_think": 16.25, + "token_counts/before_target": 3496.0, + "token_counts/before_think": 845.5 + }, + { + "avg_penalty/after_target": 2.227928251028061, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.6009773388504982, + "avg_penalty/before_think": 0.6655276790261269, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 786.5, + "completions/max_terminated_length": 754.75, + "completions/mean_length": 390.0625, + "completions/mean_terminated_length": 361.7608184814453, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.315, + "grad_norm": 5.6121344566345215, + "kl": 27.15625, + "learning_rate": 1.732542898787379e-05, + "loss": 2.191, + "num_tokens": 22000672.0, + "reward": 0.84765625, + "reward_std": 0.8053568601608276, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.296875, + "rewards/format_reward/std": 0.45283494144678116, + "rewards/tag_count_reward/mean": 0.53515625, + "rewards/tag_count_reward/std": 0.3921998664736748, + "step": 630, + "token_counts/after_target": 2066.0, + "token_counts/after_think": 101.0, + "token_counts/before_target": 3619.5, + "token_counts/before_think": 
454.5 + }, + { + "avg_penalty/after_target": 2.734377384185791, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.5045936331152916, + "avg_penalty/before_think": 0.2925497964024544, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 802.0, + "completions/max_terminated_length": 670.0, + "completions/mean_length": 296.375, + "completions/mean_terminated_length": 273.34375762939453, + "completions/min_length": 81.25, + "completions/min_terminated_length": 81.25, + "epoch": 0.3155, + "grad_norm": 5.224301815032959, + "kl": 23.625, + "learning_rate": 1.7313537016191706e-05, + "loss": 2.0943, + "num_tokens": 22030584.0, + "reward": 1.0625, + "reward_std": 0.9000023454427719, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.46875, + "rewards/format_reward/std": 0.5102732330560684, + "rewards/tag_count_reward/mean": 0.59375, + "rewards/tag_count_reward/std": 0.41414737701416016, + "step": 631, + "token_counts/after_target": 1340.75, + "token_counts/after_think": 136.75, + "token_counts/before_target": 2551.25, + "token_counts/before_think": 713.25 + }, + { + "avg_penalty/after_target": 2.8698703348636627, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.542325496673584, + "avg_penalty/before_think": 0.5011419281363487, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 838.75, + "completions/max_terminated_length": 652.5, + "completions/mean_length": 303.65625, + "completions/mean_terminated_length": 270.4729232788086, + "completions/min_length": 62.25, + "completions/min_terminated_length": 62.25, + "epoch": 0.316, + "grad_norm": 
8.85979175567627, + "kl": 31.40625, + "learning_rate": 1.7301622766207526e-05, + "loss": 2.4004, + "num_tokens": 22059730.0, + "reward": 0.8359375, + "reward_std": 0.9912009090185165, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.11180340498685837, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.48935678601264954, + "rewards/tag_count_reward/mean": 0.4296875, + "rewards/tag_count_reward/std": 0.44913674890995026, + "step": 632, + "token_counts/after_target": 1495.75, + "token_counts/after_think": 82.25, + "token_counts/before_target": 2941.25, + "token_counts/before_think": 339.25 + }, + { + "avg_penalty/after_target": 2.5824639201164246, + "avg_penalty/after_think": 1.7753445506095886, + "avg_penalty/before_target": 0.4647973105311394, + "avg_penalty/before_think": 0.6177701763808727, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 924.75, + "completions/max_terminated_length": 699.25, + "completions/mean_length": 273.78125, + "completions/mean_terminated_length": 249.2395896911621, + "completions/min_length": 63.75, + "completions/min_terminated_length": 63.75, + "epoch": 0.3165, + "grad_norm": 7.163253307342529, + "kl": 31.3125, + "learning_rate": 1.7289686274214116e-05, + "loss": 2.4143, + "num_tokens": 22089044.0, + "reward": 0.83203125, + "reward_std": 0.8983806520700455, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4797805994749069, + "rewards/tag_count_reward/mean": 0.48828125, + "rewards/tag_count_reward/std": 0.461815781891346, + "step": 633, + "token_counts/after_target": 1344.0, + "token_counts/after_think": 35.0, + "token_counts/before_target": 2744.25, + "token_counts/before_think": 257.25 + }, + { + 
"avg_penalty/after_target": 2.124814808368683, + "avg_penalty/after_think": 0.9316971302032471, + "avg_penalty/before_target": 0.4825744479894638, + "avg_penalty/before_think": 0.39767903089523315, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 227.65625, + "completions/mean_terminated_length": 227.65625, + "completions/min_length": 47.75, + "completions/min_terminated_length": 47.75, + "epoch": 0.317, + "grad_norm": 7.939286708831787, + "kl": 28.9375, + "learning_rate": 1.7277727576572108e-05, + "loss": 2.2065, + "num_tokens": 22114734.0, + "reward": 0.953125, + "reward_std": 0.9585051387548447, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.421875, + "rewards/format_reward/std": 0.500852182507515, + "rewards/tag_count_reward/mean": 0.515625, + "rewards/tag_count_reward/std": 0.45707816630601883, + "step": 634, + "token_counts/after_target": 917.75, + "token_counts/after_think": 5.5, + "token_counts/before_target": 2411.25, + "token_counts/before_think": 308.0 + }, + { + "avg_penalty/after_target": 1.7423043251037598, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.5113618224859238, + "avg_penalty/before_think": 0.5719339400529861, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 726.25, + "completions/max_terminated_length": 696.75, + "completions/mean_length": 226.140625, + "completions/mean_terminated_length": 214.1364631652832, + "completions/min_length": 49.5, + "completions/min_terminated_length": 49.5, + "epoch": 0.3175, + "grad_norm": 
4.420633792877197, + "kl": 29.65625, + "learning_rate": 1.7265746709709762e-05, + "loss": 2.3759, + "num_tokens": 22139543.0, + "reward": 1.22265625, + "reward_std": 0.9531027227640152, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.5092606842517853, + "rewards/tag_count_reward/mean": 0.64453125, + "rewards/tag_count_reward/std": 0.45971710979938507, + "step": 635, + "token_counts/after_target": 982.75, + "token_counts/after_think": 20.75, + "token_counts/before_target": 2330.5, + "token_counts/before_think": 284.25 + }, + { + "avg_penalty/after_target": 2.538316309452057, + "avg_penalty/after_think": 0.8796483278274536, + "avg_penalty/before_target": 0.36458590626716614, + "avg_penalty/before_think": 0.3752991333603859, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 163.9375, + "completions/mean_terminated_length": 163.9375, + "completions/min_length": 45.5, + "completions/min_terminated_length": 45.5, + "epoch": 0.318, + "grad_norm": 10.876097679138184, + "kl": 21.875, + "learning_rate": 1.7253743710122877e-05, + "loss": 2.0678, + "num_tokens": 22160307.0, + "reward": 1.23046875, + "reward_std": 0.9250783771276474, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.49297719448804855, + "rewards/tag_count_reward/mean": 0.63671875, + "rewards/tag_count_reward/std": 0.4423914924263954, + "step": 636, + "token_counts/after_target": 571.0, + "token_counts/after_think": 3.5, + "token_counts/before_target": 1704.0, + "token_counts/before_think": 344.5 + }, + { + "avg_penalty/after_target": 2.9844915568828583, + 
"avg_penalty/after_think": 1.977950930595398, + "avg_penalty/before_target": 0.3066505528986454, + "avg_penalty/before_think": 0.31687548756599426, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.25, + "completions/max_terminated_length": 399.25, + "completions/mean_length": 164.015625, + "completions/mean_terminated_length": 164.015625, + "completions/min_length": 42.75, + "completions/min_terminated_length": 42.75, + "epoch": 0.3185, + "grad_norm": 10.747122764587402, + "kl": 16.48828125, + "learning_rate": 1.7241718614374678e-05, + "loss": 1.64, + "num_tokens": 22180260.0, + "reward": 1.41796875, + "reward_std": 0.9315594732761383, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.45916909724473953, + "rewards/tag_count_reward/mean": 0.69921875, + "rewards/tag_count_reward/std": 0.4328705817461014, + "step": 637, + "token_counts/after_target": 516.0, + "token_counts/after_think": 13.75, + "token_counts/before_target": 1734.25, + "token_counts/before_think": 360.25 + }, + { + "avg_penalty/after_target": 2.231768637895584, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.2612556517124176, + "avg_penalty/before_think": 0.4271375760436058, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.75, + "completions/max_terminated_length": 429.75, + "completions/mean_length": 136.03125, + "completions/mean_terminated_length": 136.03125, + "completions/min_length": 46.5, + "completions/min_terminated_length": 46.5, + "epoch": 0.319, + "grad_norm": 10.114116668701172, + "kl": 14.4765625, + 
"learning_rate": 1.7229671459095682e-05, + "loss": 1.5258, + "num_tokens": 22200870.0, + "reward": 1.55859375, + "reward_std": 0.7853414863348007, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4255262687802315, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.37244875729084015, + "step": 638, + "token_counts/after_target": 267.25, + "token_counts/after_think": 27.0, + "token_counts/before_target": 1484.5, + "token_counts/before_think": 397.75 + }, + { + "avg_penalty/after_target": 2.0468015372753143, + "avg_penalty/after_think": 2.9166367053985596, + "avg_penalty/before_target": 0.34691399335861206, + "avg_penalty/before_think": 0.400833860039711, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.25, + "completions/max_terminated_length": 480.25, + "completions/mean_length": 143.03125, + "completions/mean_terminated_length": 143.03125, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.3195, + "grad_norm": 15.480706214904785, + "kl": 20.8125, + "learning_rate": 1.7217602280983622e-05, + "loss": 2.1138, + "num_tokens": 22220072.0, + "reward": 1.48046875, + "reward_std": 0.8645354509353638, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4604102149605751, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.4158061668276787, + "step": 639, + "token_counts/after_target": 414.75, + "token_counts/after_think": 11.75, + "token_counts/before_target": 1516.0, + "token_counts/before_think": 346.0 + }, + { + "avg_penalty/after_target": 1.976994276046753, + "avg_penalty/after_think": 
3.741288125514984, + "avg_penalty/before_target": 0.31322673708200455, + "avg_penalty/before_think": 0.4273047223687172, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.25, + "completions/max_terminated_length": 465.25, + "completions/mean_length": 153.25, + "completions/mean_terminated_length": 153.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.32, + "grad_norm": 5.685085773468018, + "kl": 17.5, + "learning_rate": 1.7205511116803306e-05, + "loss": 1.5933, + "num_tokens": 22238968.0, + "reward": 1.5078125, + "reward_std": 0.8021180927753448, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4339347705245018, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.3897322565317154, + "step": 640, + "token_counts/after_target": 290.5, + "token_counts/after_think": 18.5, + "token_counts/before_target": 1774.0, + "token_counts/before_think": 369.0 + }, + { + "avg_penalty/after_target": 1.9998909831047058, + "avg_penalty/after_think": 2.755474328994751, + "avg_penalty/before_target": 0.3034573458135128, + "avg_penalty/before_think": 0.5640792027115822, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.75, + "completions/max_terminated_length": 498.75, + "completions/mean_length": 130.40625, + "completions/mean_terminated_length": 130.40625, + "completions/min_length": 27.75, + "completions/min_terminated_length": 27.75, + "epoch": 0.3205, + "grad_norm": 5.5508036613464355, + "kl": 24.375, + "learning_rate": 1.7193398003386514e-05, + "loss": 2.2162, + 
"num_tokens": 22257874.0, + "reward": 1.54296875, + "reward_std": 0.7207634299993515, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.3777689263224602, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.3719623014330864, + "step": 641, + "token_counts/after_target": 259.25, + "token_counts/after_think": 8.75, + "token_counts/before_target": 1569.0, + "token_counts/before_think": 249.5 + }, + { + "avg_penalty/after_target": 2.953742802143097, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.6135724112391472, + "avg_penalty/before_think": 0.5215130224823952, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 806.0, + "completions/max_terminated_length": 806.0, + "completions/mean_length": 219.703125, + "completions/mean_terminated_length": 219.703125, + "completions/min_length": 24.25, + "completions/min_terminated_length": 24.25, + "epoch": 0.321, + "grad_norm": 10.34109878540039, + "kl": 26.8125, + "learning_rate": 1.718126297763189e-05, + "loss": 2.4922, + "num_tokens": 22281503.0, + "reward": 1.0625, + "reward_std": 0.8040417730808258, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.49244368076324463, + "rewards/tag_count_reward/mean": 0.609375, + "rewards/tag_count_reward/std": 0.3829464763402939, + "step": 642, + "token_counts/after_target": 1164.5, + "token_counts/after_think": 16.5, + "token_counts/before_target": 2073.0, + "token_counts/before_think": 261.25 + }, + { + "avg_penalty/after_target": 2.961923122406006, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.2632131353020668, + "avg_penalty/before_think": 
0.6360194459557533, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.25, + "completions/max_terminated_length": 500.25, + "completions/mean_length": 191.234375, + "completions/mean_terminated_length": 191.234375, + "completions/min_length": 17.25, + "completions/min_terminated_length": 17.25, + "epoch": 0.3215, + "grad_norm": 58.031707763671875, + "kl": 52.125, + "learning_rate": 1.716910607650483e-05, + "loss": 2.7327, + "num_tokens": 22306398.0, + "reward": 0.4453125, + "reward_std": 0.3727082647383213, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.4140625, + "rewards/tag_count_reward/std": 0.2861243970692158, + "step": 643, + "token_counts/after_target": 540.0, + "token_counts/after_think": 0.0, + "token_counts/before_target": 2318.25, + "token_counts/before_think": 201.5 + }, + { + "avg_penalty/after_target": 2.9338656067848206, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.4616374783217907, + "avg_penalty/before_think": 0.7547727338969707, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.5, + "completions/max_terminated_length": 514.5, + "completions/mean_length": 202.1875, + "completions/mean_terminated_length": 202.1875, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.322, + "grad_norm": 58.356040954589844, + "kl": 63.3125, + "learning_rate": 1.715692733703736e-05, + "loss": 3.6501, + "num_tokens": 22329674.0, + "reward": 0.390625, + "reward_std": 0.4577757641673088, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.1632782220840454, + "rewards/tag_count_reward/mean": 0.328125, + "rewards/tag_count_reward/std": 0.3216659054160118, + "step": 644, + "token_counts/after_target": 1060.75, + "token_counts/after_think": 0.0, + "token_counts/before_target": 2004.5, + "token_counts/before_think": 169.75 + }, + { + "avg_penalty/after_target": 2.3415169715881348, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.29311375692486763, + "avg_penalty/before_think": 0.44297057017683983, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.75, + "completions/max_terminated_length": 553.75, + "completions/mean_length": 182.5625, + "completions/mean_terminated_length": 182.5625, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.3225, + "grad_norm": 74.17899322509766, + "kl": 62.5, + "learning_rate": 1.7144726796328034e-05, + "loss": 3.0606, + "num_tokens": 22352894.0, + "reward": 0.3359375, + "reward_std": 0.3185105323791504, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.3203125, + "rewards/tag_count_reward/std": 0.2834039553999901, + "step": 645, + "token_counts/after_target": 502.0, + "token_counts/after_think": 0.0, + "token_counts/before_target": 2329.25, + "token_counts/before_think": 89.75 + }, + { + "avg_penalty/after_target": 3.018020749092102, + "avg_penalty/after_think": 0.8498560190200806, + "avg_penalty/before_target": 0.3575940318405628, + "avg_penalty/before_think": 0.2535870149731636, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.75, + "completions/max_terminated_length": 559.75, + "completions/mean_length": 149.734375, + "completions/mean_terminated_length": 149.734375, + "completions/min_length": 9.25, + "completions/min_terminated_length": 9.25, + "epoch": 0.323, + "grad_norm": 54.49995040893555, + "kl": 58.625, + "learning_rate": 1.713250449154182e-05, + "loss": 3.2374, + "num_tokens": 22373453.0, + "reward": 0.41796875, + "reward_std": 0.3422318920493126, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.38671875, + "rewards/tag_count_reward/std": 0.2590267024934292, + "step": 646, + "token_counts/after_target": 650.75, + "token_counts/after_think": 3.25, + "token_counts/before_target": 1681.5, + "token_counts/before_think": 60.25 + }, + { + "avg_penalty/after_target": 1.9042473435401917, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.3219135105609894, + "avg_penalty/before_think": 0.5595711022615433, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 199.21875, + "completions/mean_terminated_length": 199.21875, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.3235, + "grad_norm": 60.710147857666016, + "kl": 55.9375, + "learning_rate": 1.712026045990997e-05, + "loss": 2.8915, + "num_tokens": 22397915.0, + "reward": 0.375, + "reward_std": 0.37708553671836853, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.125, + "rewards/tag_count_reward/mean": 
0.34375, + "rewards/tag_count_reward/std": 0.28139448165893555, + "step": 647, + "token_counts/after_target": 533.0, + "token_counts/after_think": 0.0, + "token_counts/before_target": 2341.75, + "token_counts/before_think": 312.75 + }, + { + "avg_penalty/after_target": 2.4885116815567017, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.42858193814754486, + "avg_penalty/before_think": 1.1325813084840775, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 658.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 205.140625, + "completions/mean_terminated_length": 192.26979446411133, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.324, + "grad_norm": 37.992950439453125, + "kl": 46.0, + "learning_rate": 1.710799473872993e-05, + "loss": 2.8827, + "num_tokens": 22420708.0, + "reward": 0.4296875, + "reward_std": 0.37228310108184814, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.14789126068353653, + "rewards/tag_count_reward/mean": 0.3828125, + "rewards/tag_count_reward/std": 0.2479502409696579, + "step": 648, + "token_counts/after_target": 850.5, + "token_counts/after_think": 84.75, + "token_counts/before_target": 2217.25, + "token_counts/before_think": 129.75 + }, + { + "avg_penalty/after_target": 2.205944299697876, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.3352597691118717, + "avg_penalty/before_think": 0.27406566962599754, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.5, + "completions/max_terminated_length": 554.5, + 
"completions/mean_length": 171.8125, + "completions/mean_terminated_length": 171.8125, + "completions/min_length": 8.5, + "completions/min_terminated_length": 8.5, + "epoch": 0.3245, + "grad_norm": 30.507497787475586, + "kl": 36.9375, + "learning_rate": 1.709570736536521e-05, + "loss": 2.2122, + "num_tokens": 22445656.0, + "reward": 0.48046875, + "reward_std": 0.37880364060401917, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.11180340498685837, + "rewards/tag_count_reward/mean": 0.41796875, + "rewards/tag_count_reward/std": 0.278156079351902, + "step": 649, + "token_counts/after_target": 453.25, + "token_counts/after_think": 0.0, + "token_counts/before_target": 2235.5, + "token_counts/before_think": 60.25 + }, + { + "avg_penalty/after_target": 1.9982945024967194, + "avg_penalty/after_think": 0.29530349373817444, + "avg_penalty/before_target": 0.3616308346390724, + "avg_penalty/before_think": 0.3598184324800968, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 776.0, + "completions/max_terminated_length": 692.75, + "completions/mean_length": 248.734375, + "completions/mean_terminated_length": 236.82500457763672, + "completions/min_length": 16.5, + "completions/min_terminated_length": 16.5, + "epoch": 0.325, + "grad_norm": 18.315082550048828, + "kl": 27.21875, + "learning_rate": 1.708339837724529e-05, + "loss": 1.8, + "num_tokens": 22470359.0, + "reward": 0.44140625, + "reward_std": 0.27901691198349, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.0625, + "rewards/tag_count_reward/mean": 0.42578125, + "rewards/tag_count_reward/std": 0.24045421555638313, + "step": 650, + 
"token_counts/after_target": 697.0, + "token_counts/after_think": 25.5, + "token_counts/before_target": 2682.25, + "token_counts/before_think": 575.0 + }, + { + "avg_penalty/after_target": 2.76511150598526, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.4672253355383873, + "avg_penalty/before_think": 0.15192074701189995, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 910.75, + "completions/max_terminated_length": 771.5, + "completions/mean_length": 313.078125, + "completions/mean_terminated_length": 290.8479232788086, + "completions/min_length": 15.5, + "completions/min_terminated_length": 15.5, + "epoch": 0.3255, + "grad_norm": 5.052567005157471, + "kl": 24.46875, + "learning_rate": 1.7071067811865477e-05, + "loss": 2.1048, + "num_tokens": 22498956.0, + "reward": 0.234375, + "reward_std": 0.2702687308192253, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.234375, + "rewards/tag_count_reward/std": 0.2702687382698059, + "step": 651, + "token_counts/after_target": 1792.25, + "token_counts/after_think": 0.0, + "token_counts/before_target": 3173.5, + "token_counts/before_think": 43.5 + }, + { + "avg_penalty/after_target": 2.2404658496379852, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.39274856448173523, + "avg_penalty/before_think": 0.3876339793205261, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 823.5, + "completions/max_terminated_length": 736.25, + "completions/mean_length": 212.71875, + "completions/mean_terminated_length": 
200.08958435058594, + "completions/min_length": 25.25, + "completions/min_terminated_length": 25.25, + "epoch": 0.326, + "grad_norm": 10.715057373046875, + "kl": 12.984375, + "learning_rate": 1.7058715706786813e-05, + "loss": 1.3741, + "num_tokens": 22520986.0, + "reward": 0.35546875, + "reward_std": 0.24561287090182304, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.35546875, + "rewards/tag_count_reward/std": 0.24561287835240364, + "step": 652, + "token_counts/after_target": 788.75, + "token_counts/after_think": 0.0, + "token_counts/before_target": 2447.0, + "token_counts/before_think": 167.75 + }, + { + "avg_penalty/after_target": 2.3470607101917267, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.44726432859897614, + "avg_penalty/before_think": 0.539095152169466, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 706.75, + "completions/max_terminated_length": 699.0, + "completions/mean_length": 312.59375, + "completions/mean_terminated_length": 302.3093795776367, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.3265, + "grad_norm": 10.500871658325195, + "kl": 11.8125, + "learning_rate": 1.7046342099635948e-05, + "loss": 1.2157, + "num_tokens": 22552464.0, + "reward": 0.33984375, + "reward_std": 0.27946022897958755, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.32421875, + "rewards/tag_count_reward/std": 0.25642941519618034, + "step": 653, + "token_counts/after_target": 1335.25, + "token_counts/after_think": 0.0, + "token_counts/before_target": 
3539.5, + "token_counts/before_think": 126.75 + }, + { + "avg_penalty/after_target": 2.922937035560608, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.4582997299730778, + "avg_penalty/before_think": 0.10120362788438797, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 789.0, + "completions/max_terminated_length": 767.5, + "completions/mean_length": 284.0, + "completions/mean_terminated_length": 272.5510482788086, + "completions/min_length": 21.25, + "completions/min_terminated_length": 21.25, + "epoch": 0.327, + "grad_norm": 56.56250762939453, + "kl": 10.4375, + "learning_rate": 1.703394702810504e-05, + "loss": 1.4089, + "num_tokens": 22582944.0, + "reward": 0.53125, + "reward_std": 0.2913864143192768, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.515625, + "rewards/tag_count_reward/std": 0.2695344053208828, + "step": 654, + "token_counts/after_target": 1195.25, + "token_counts/after_think": 8.25, + "token_counts/before_target": 3236.5, + "token_counts/before_think": 104.0 + }, + { + "avg_penalty/after_target": 2.0881146788597107, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.4343517944216728, + "avg_penalty/before_think": 1.3149083256721497, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.5, + "completions/max_terminated_length": 699.5, + "completions/mean_length": 313.015625, + "completions/mean_terminated_length": 313.015625, + "completions/min_length": 18.5, + "completions/min_terminated_length": 18.5, + "epoch": 0.3275, + "grad_norm": 
15.155557632446289, + "kl": 11.890625, + "learning_rate": 1.7021530529951627e-05, + "loss": 1.3817, + "num_tokens": 22610241.0, + "reward": 0.45703125, + "reward_std": 0.2974031791090965, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.45703125, + "rewards/tag_count_reward/std": 0.2974031940102577, + "step": 655, + "token_counts/after_target": 1315.75, + "token_counts/after_think": 0.5, + "token_counts/before_target": 3534.25, + "token_counts/before_think": 157.75 + }, + { + "avg_penalty/after_target": 2.8182762563228607, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.3330218829214573, + "avg_penalty/before_think": 0.4348236843943596, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.25, + "completions/max_terminated_length": 549.25, + "completions/mean_length": 252.65625, + "completions/mean_terminated_length": 252.65625, + "completions/min_length": 14.25, + "completions/min_terminated_length": 14.25, + "epoch": 0.328, + "grad_norm": 6.912746906280518, + "kl": 13.71875, + "learning_rate": 1.700909264299851e-05, + "loss": 1.2426, + "num_tokens": 22639547.0, + "reward": 0.46875, + "reward_std": 0.304520882666111, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.46875, + "rewards/tag_count_reward/std": 0.304520882666111, + "step": 656, + "token_counts/after_target": 778.5, + "token_counts/after_think": 36.25, + "token_counts/before_target": 3061.5, + "token_counts/before_think": 166.25 + }, + { + "avg_penalty/after_target": 2.552520513534546, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 
0.46747493743896484, + "avg_penalty/before_think": 0.508928757160902, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 816.25, + "completions/max_terminated_length": 764.75, + "completions/mean_length": 328.234375, + "completions/mean_terminated_length": 316.77396392822266, + "completions/min_length": 21.75, + "completions/min_terminated_length": 21.75, + "epoch": 0.3285, + "grad_norm": 19.011919021606445, + "kl": 12.34375, + "learning_rate": 1.6996633405133656e-05, + "loss": 1.5995, + "num_tokens": 22670394.0, + "reward": 0.55078125, + "reward_std": 0.3108243867754936, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.55078125, + "rewards/tag_count_reward/std": 0.3108243867754936, + "step": 657, + "token_counts/after_target": 1898.0, + "token_counts/after_think": 131.5, + "token_counts/before_target": 3209.0, + "token_counts/before_think": 13.25 + }, + { + "avg_penalty/after_target": 1.769548624753952, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.49872176349163055, + "avg_penalty/before_think": 1.3799320757389069, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 331.65625, + "completions/mean_terminated_length": 331.65625, + "completions/min_length": 67.25, + "completions/min_terminated_length": 67.25, + "epoch": 0.329, + "grad_norm": 11.626755714416504, + "kl": 12.171875, + "learning_rate": 1.6984152854310063e-05, + "loss": 1.4229, + "num_tokens": 22702708.0, + "reward": 0.5703125, + 
"reward_std": 0.26098649203777313, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5703125, + "rewards/tag_count_reward/std": 0.26098649203777313, + "step": 658, + "token_counts/after_target": 1326.5, + "token_counts/after_think": 83.5, + "token_counts/before_target": 3733.0, + "token_counts/before_think": 163.5 + }, + { + "avg_penalty/after_target": 2.089748591184616, + "avg_penalty/after_think": 2.946645498275757, + "avg_penalty/before_target": 0.48989221453666687, + "avg_penalty/before_think": 0.9823606833815575, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 834.5, + "completions/max_terminated_length": 665.75, + "completions/mean_length": 306.34375, + "completions/mean_terminated_length": 282.1927146911621, + "completions/min_length": 18.25, + "completions/min_terminated_length": 18.25, + "epoch": 0.3295, + "grad_norm": 8.599313735961914, + "kl": 16.21875, + "learning_rate": 1.697165102854565e-05, + "loss": 1.6313, + "num_tokens": 22731882.0, + "reward": 0.609375, + "reward_std": 0.31769200414419174, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.59375, + "rewards/tag_count_reward/std": 0.2975464388728142, + "step": 659, + "token_counts/after_target": 1321.75, + "token_counts/after_think": 106.75, + "token_counts/before_target": 3250.0, + "token_counts/before_think": 223.0 + }, + { + "avg_penalty/after_target": 2.30499404668808, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.5724720135331154, + "avg_penalty/before_think": 0.891909833997488, + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 257.078125, + "completions/mean_terminated_length": 257.078125, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.33, + "grad_norm": 3.011566400527954, + "kl": 19.375, + "learning_rate": 1.6959127965923144e-05, + "loss": 1.7657, + "num_tokens": 22758591.0, + "reward": 0.6171875, + "reward_std": 0.2980586551129818, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6015625, + "rewards/tag_count_reward/std": 0.3009691908955574, + "step": 660, + "token_counts/after_target": 1340.0, + "token_counts/after_think": 0.0, + "token_counts/before_target": 2268.5, + "token_counts/before_think": 504.75 + }, + { + "avg_penalty/after_target": 2.0856531262397766, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.41677088290452957, + "avg_penalty/before_think": 0.1514386273920536, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.75, + "completions/max_terminated_length": 582.75, + "completions/mean_length": 248.421875, + "completions/mean_terminated_length": 248.421875, + "completions/min_length": 12.5, + "completions/min_terminated_length": 12.5, + "epoch": 0.3305, + "grad_norm": 15.633891105651855, + "kl": 24.625, + "learning_rate": 1.6946583704589973e-05, + "loss": 1.6847, + "num_tokens": 22782954.0, + "reward": 0.53125, + "reward_std": 0.31293679773807526, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.53125, + "rewards/tag_count_reward/std": 0.31293680518865585, + "step": 661, + "token_counts/after_target": 1028.0, + "token_counts/after_think": 0.0, + "token_counts/before_target": 2856.25, + "token_counts/before_think": 90.5 + }, + { + "avg_penalty/after_target": 2.721025824546814, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.3639316111803055, + "avg_penalty/before_think": 0.10067075118422508, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.5, + "completions/max_terminated_length": 654.5, + "completions/mean_length": 301.015625, + "completions/mean_terminated_length": 301.015625, + "completions/min_length": 29.25, + "completions/min_terminated_length": 29.25, + "epoch": 0.331, + "grad_norm": 9.534345626831055, + "kl": 26.875, + "learning_rate": 1.693401828275813e-05, + "loss": 2.0311, + "num_tokens": 22814315.0, + "reward": 0.6015625, + "reward_std": 0.2763550356030464, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.6015625, + "rewards/tag_count_reward/std": 0.276355043053627, + "step": 662, + "token_counts/after_target": 1327.75, + "token_counts/after_think": 0.0, + "token_counts/before_target": 3486.25, + "token_counts/before_think": 2.25 + }, + { + "avg_penalty/after_target": 2.3041435182094574, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.5170420035719872, + "avg_penalty/before_think": 0.4607800170779228, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + 
"completions/max_length": 914.75, + "completions/max_terminated_length": 838.75, + "completions/mean_length": 312.3125, + "completions/mean_terminated_length": 290.2697982788086, + "completions/min_length": 23.5, + "completions/min_terminated_length": 23.5, + "epoch": 0.3315, + "grad_norm": 7.902656078338623, + "kl": 28.34375, + "learning_rate": 1.692143173870407e-05, + "loss": 2.2596, + "num_tokens": 22846095.0, + "reward": 0.5390625, + "reward_std": 0.31131184846162796, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.5390625, + "rewards/tag_count_reward/std": 0.31131185591220856, + "step": 663, + "token_counts/after_target": 1489.5, + "token_counts/after_think": 0.0, + "token_counts/before_target": 3343.25, + "token_counts/before_think": 164.25 + }, + { + "avg_penalty/after_target": 2.7343491911888123, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.47684262692928314, + "avg_penalty/before_think": 1.397388443350792, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.75, + "completions/max_terminated_length": 686.75, + "completions/mean_length": 293.859375, + "completions/mean_terminated_length": 293.859375, + "completions/min_length": 17.5, + "completions/min_terminated_length": 17.5, + "epoch": 0.332, + "grad_norm": 3.7194836139678955, + "kl": 19.53125, + "learning_rate": 1.6908824110768584e-05, + "loss": 1.7082, + "num_tokens": 22874374.0, + "reward": 0.65234375, + "reward_std": 0.2264837734401226, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/tag_count_reward/mean": 0.65234375, + "rewards/tag_count_reward/std": 
0.2264837808907032, + "step": 664, + "token_counts/after_target": 1205.25, + "token_counts/after_think": 0.0, + "token_counts/before_target": 3344.0, + "token_counts/before_think": 152.5 + }, + { + "avg_penalty/after_target": 1.9300937354564667, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3961704671382904, + "avg_penalty/before_think": 1.0134921222925186, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.25, + "completions/max_terminated_length": 668.25, + "completions/mean_length": 308.84375, + "completions/mean_terminated_length": 308.84375, + "completions/min_length": 14.75, + "completions/min_terminated_length": 14.75, + "epoch": 0.3325, + "grad_norm": 3.436603546142578, + "kl": 17.84375, + "learning_rate": 1.68961954373567e-05, + "loss": 1.5628, + "num_tokens": 22906700.0, + "reward": 1.19140625, + "reward_std": 0.7620777934789658, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.5112857818603516, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.3402147740125656, + "step": 665, + "token_counts/after_target": 802.0, + "token_counts/after_think": 412.75, + "token_counts/before_target": 2713.75, + "token_counts/before_think": 1013.0 + }, + { + "avg_penalty/after_target": 2.179317146539688, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.32877932488918304, + "avg_penalty/before_think": 0.44039614871144295, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.75, + "completions/max_terminated_length": 565.75, + "completions/mean_length": 229.03125, + 
"completions/mean_terminated_length": 229.03125, + "completions/min_length": 47.25, + "completions/min_terminated_length": 47.25, + "epoch": 0.333, + "grad_norm": 4.772176265716553, + "kl": 16.234375, + "learning_rate": 1.688354575693754e-05, + "loss": 1.3939, + "num_tokens": 22931790.0, + "reward": 1.21875, + "reward_std": 0.7952000498771667, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.46875, + "rewards/format_reward/std": 0.5102732330560684, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.37380290776491165, + "step": 666, + "token_counts/after_target": 497.25, + "token_counts/after_think": 28.0, + "token_counts/before_target": 2592.25, + "token_counts/before_think": 547.0 + }, + { + "avg_penalty/after_target": 2.472700208425522, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.38636477664113045, + "avg_penalty/before_think": 0.5201167464256287, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.75, + "completions/max_terminated_length": 569.75, + "completions/mean_length": 220.296875, + "completions/mean_terminated_length": 220.296875, + "completions/min_length": 29.25, + "completions/min_terminated_length": 29.25, + "epoch": 0.3335, + "grad_norm": 12.070721626281738, + "kl": 13.96875, + "learning_rate": 1.6870875108044233e-05, + "loss": 1.5108, + "num_tokens": 22955809.0, + "reward": 1.3984375, + "reward_std": 0.8071436733007431, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.49500229209661484, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.3326682224869728, + "step": 667, + "token_counts/after_target": 726.0, + 
"token_counts/after_think": 15.0, + "token_counts/before_target": 2094.25, + "token_counts/before_think": 689.5 + }, + { + "avg_penalty/after_target": 2.958901733160019, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.2893561013042927, + "avg_penalty/before_think": 0.3598693981766701, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.5, + "completions/max_terminated_length": 595.5, + "completions/mean_length": 221.171875, + "completions/mean_terminated_length": 221.171875, + "completions/min_length": 50.5, + "completions/min_terminated_length": 50.5, + "epoch": 0.334, + "grad_norm": 12.960554122924805, + "kl": 16.34375, + "learning_rate": 1.6858183529273766e-05, + "loss": 1.6986, + "num_tokens": 22978876.0, + "reward": 1.3125, + "reward_std": 0.7683017700910568, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.48989029973745346, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.3583788052201271, + "step": 668, + "token_counts/after_target": 795.5, + "token_counts/after_think": 27.75, + "token_counts/before_target": 2110.0, + "token_counts/before_think": 605.5 + }, + { + "avg_penalty/after_target": 2.6737635135650635, + "avg_penalty/after_think": 1.9352160692214966, + "avg_penalty/before_target": 0.29628076776862144, + "avg_penalty/before_think": 0.36447352543473244, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.75, + "completions/max_terminated_length": 483.75, + "completions/mean_length": 186.59375, + "completions/mean_terminated_length": 186.59375, + 
"completions/min_length": 56.75, + "completions/min_terminated_length": 56.75, + "epoch": 0.3345, + "grad_norm": 15.741057395935059, + "kl": 13.34375, + "learning_rate": 1.684547105928689e-05, + "loss": 1.4509, + "num_tokens": 23001186.0, + "reward": 1.48828125, + "reward_std": 0.7216243147850037, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.48558124154806137, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.2895554341375828, + "step": 669, + "token_counts/after_target": 495.25, + "token_counts/after_think": 34.25, + "token_counts/before_target": 1994.0, + "token_counts/before_think": 462.0 + }, + { + "avg_penalty/after_target": 2.438200891017914, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.36954163014888763, + "avg_penalty/before_think": 0.5714595019817352, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 650.75, + "completions/max_terminated_length": 490.75, + "completions/mean_length": 210.65625, + "completions/mean_terminated_length": 197.76458740234375, + "completions/min_length": 71.5, + "completions/min_terminated_length": 71.5, + "epoch": 0.335, + "grad_norm": 4.913022041320801, + "kl": 20.5625, + "learning_rate": 1.6832737736807994e-05, + "loss": 1.7646, + "num_tokens": 23025372.0, + "reward": 1.2421875, + "reward_std": 0.8012042045593262, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5018647313117981, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.361223541200161, + "step": 670, + "token_counts/after_target": 643.5, + "token_counts/after_think": 83.75, + "token_counts/before_target": 
2177.25, + "token_counts/before_think": 466.0 + }, + { + "avg_penalty/after_target": 2.410776048898697, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.512827180325985, + "avg_penalty/before_think": 0.6271142587065697, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.25, + "completions/max_terminated_length": 642.25, + "completions/mean_length": 192.796875, + "completions/mean_terminated_length": 192.796875, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.3355, + "grad_norm": 4.77422571182251, + "kl": 25.875, + "learning_rate": 1.6819983600624986e-05, + "loss": 2.2912, + "num_tokens": 23046383.0, + "reward": 1.31640625, + "reward_std": 0.8866871744394302, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.500852182507515, + "rewards/tag_count_reward/mean": 0.70703125, + "rewards/tag_count_reward/std": 0.410528227686882, + "step": 671, + "token_counts/after_target": 666.5, + "token_counts/after_think": 21.25, + "token_counts/before_target": 2056.25, + "token_counts/before_think": 340.75 + }, + { + "avg_penalty/after_target": 2.3086321651935577, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.46832844614982605, + "avg_penalty/before_think": 0.3884919285774231, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 752.25, + "completions/max_terminated_length": 752.25, + "completions/mean_length": 255.15625, + "completions/mean_terminated_length": 255.15625, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.336, + 
"grad_norm": 8.414010047912598, + "kl": 35.625, + "learning_rate": 1.680720868958918e-05, + "loss": 2.7579, + "num_tokens": 23069817.0, + "reward": 1.2890625, + "reward_std": 0.8453326970338821, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.48605145514011383, + "rewards/tag_count_reward/mean": 0.7109375, + "rewards/tag_count_reward/std": 0.3898617923259735, + "step": 672, + "token_counts/after_target": 1072.5, + "token_counts/after_think": 24.75, + "token_counts/before_target": 2483.75, + "token_counts/before_think": 501.5 + }, + { + "avg_penalty/after_target": 2.760430335998535, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3305617794394493, + "avg_penalty/before_think": 0.35305652022361755, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.5, + "completions/max_terminated_length": 496.5, + "completions/mean_length": 234.4375, + "completions/mean_terminated_length": 234.4375, + "completions/min_length": 73.75, + "completions/min_terminated_length": 73.75, + "epoch": 0.3365, + "grad_norm": 21.87400245666504, + "kl": 36.21875, + "learning_rate": 1.6794413042615168e-05, + "loss": 2.403, + "num_tokens": 23094453.0, + "reward": 1.046875, + "reward_std": 0.9614248424768448, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.484375, + "rewards/format_reward/std": 0.5133601278066635, + "rewards/tag_count_reward/mean": 0.5625, + "rewards/tag_count_reward/std": 0.47132542729377747, + "step": 673, + "token_counts/after_target": 728.0, + "token_counts/after_think": 16.5, + "token_counts/before_target": 2499.25, + "token_counts/before_think": 507.25 + }, + { + "avg_penalty/after_target": 2.6619282364845276, + 
"avg_penalty/after_think": 0.5788717269897461, + "avg_penalty/before_target": 0.5110260993242264, + "avg_penalty/before_think": 0.380806639790535, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 686.75, + "completions/max_terminated_length": 680.25, + "completions/mean_length": 310.421875, + "completions/mean_terminated_length": 300.1375045776367, + "completions/min_length": 84.25, + "completions/min_terminated_length": 84.25, + "epoch": 0.337, + "grad_norm": 20.043455123901367, + "kl": 43.1875, + "learning_rate": 1.6781596698680708e-05, + "loss": 3.0634, + "num_tokens": 23125968.0, + "reward": 1.0, + "reward_std": 0.9729802161455154, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.46875, + "rewards/format_reward/std": 0.5122983306646347, + "rewards/tag_count_reward/mean": 0.53125, + "rewards/tag_count_reward/std": 0.47663552314043045, + "step": 674, + "token_counts/after_target": 1553.25, + "token_counts/after_think": 3.75, + "token_counts/before_target": 2996.0, + "token_counts/before_think": 413.75 + }, + { + "avg_penalty/after_target": 2.7672933638095856, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.38294683024287224, + "avg_penalty/before_think": 0.4335237257182598, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.5, + "completions/max_terminated_length": 611.5, + "completions/mean_length": 259.203125, + "completions/mean_terminated_length": 259.203125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.3375, + "grad_norm": 24.543724060058594, + "kl": 39.875, + "learning_rate": 
1.6768759696826608e-05, + "loss": 2.6169, + "num_tokens": 23151517.0, + "reward": 0.91796875, + "reward_std": 0.9616102427244186, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.4375, + "rewards/format_reward/std": 0.49345622956752777, + "rewards/tag_count_reward/mean": 0.48046875, + "rewards/tag_count_reward/std": 0.47698284685611725, + "step": 675, + "token_counts/after_target": 929.5, + "token_counts/after_think": 44.75, + "token_counts/before_target": 2741.25, + "token_counts/before_think": 431.75 + }, + { + "avg_penalty/after_target": 2.746230572462082, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.325839813798666, + "avg_penalty/before_think": 0.4097164608538151, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 227.8125, + "completions/mean_terminated_length": 227.8125, + "completions/min_length": 84.75, + "completions/min_terminated_length": 84.75, + "epoch": 0.338, + "grad_norm": 10.846861839294434, + "kl": 31.78125, + "learning_rate": 1.6755902076156606e-05, + "loss": 2.2444, + "num_tokens": 23175441.0, + "reward": 1.125, + "reward_std": 0.9919146746397018, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.5112857818603516, + "rewards/tag_count_reward/mean": 0.578125, + "rewards/tag_count_reward/std": 0.4860072731971741, + "step": 676, + "token_counts/after_target": 766.75, + "token_counts/after_think": 7.0, + "token_counts/before_target": 2484.25, + "token_counts/before_think": 387.0 + }, + { + "avg_penalty/after_target": 2.589148074388504, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3612017147243023, + 
"avg_penalty/before_think": 0.5099109187722206, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.25, + "completions/max_terminated_length": 464.25, + "completions/mean_length": 224.890625, + "completions/mean_terminated_length": 224.890625, + "completions/min_length": 72.25, + "completions/min_terminated_length": 72.25, + "epoch": 0.3385, + "grad_norm": 13.733417510986328, + "kl": 30.25, + "learning_rate": 1.6743023875837233e-05, + "loss": 2.1442, + "num_tokens": 23199018.0, + "reward": 0.8828125, + "reward_std": 0.9783841818571091, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.421875, + "rewards/format_reward/std": 0.5028772801160812, + "rewards/tag_count_reward/mean": 0.4453125, + "rewards/tag_count_reward/std": 0.4850275442004204, + "step": 677, + "token_counts/after_target": 739.5, + "token_counts/after_think": 28.5, + "token_counts/before_target": 2576.75, + "token_counts/before_think": 253.5 + }, + { + "avg_penalty/after_target": 2.706253170967102, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.34601425379514694, + "avg_penalty/before_think": 0.3784947618842125, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.75, + "completions/max_terminated_length": 422.75, + "completions/mean_length": 192.046875, + "completions/mean_terminated_length": 192.046875, + "completions/min_length": 58.5, + "completions/min_terminated_length": 58.5, + "epoch": 0.339, + "grad_norm": 3.4053826332092285, + "kl": 24.84375, + "learning_rate": 1.6730125135097736e-05, + "loss": 1.9902, + "num_tokens": 23222045.0, + "reward": 1.015625, + "reward_std": 
1.0119883716106415, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5163977742195129, + "rewards/tag_count_reward/mean": 0.515625, + "rewards/tag_count_reward/std": 0.49920351803302765, + "step": 678, + "token_counts/after_target": 513.75, + "token_counts/after_think": 32.25, + "token_counts/before_target": 2189.5, + "token_counts/before_think": 337.25 + }, + { + "avg_penalty/after_target": 2.8018308877944946, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.36662687361240387, + "avg_penalty/before_think": 0.3073377199470997, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.25, + "completions/max_terminated_length": 535.25, + "completions/mean_length": 235.09375, + "completions/mean_terminated_length": 235.09375, + "completions/min_length": 70.75, + "completions/min_terminated_length": 70.75, + "epoch": 0.3395, + "grad_norm": 10.167040824890137, + "kl": 19.078125, + "learning_rate": 1.6717205893229904e-05, + "loss": 1.8055, + "num_tokens": 23246659.0, + "reward": 1.3359375, + "reward_std": 1.07805635035038, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.12909944355487823, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.497555673122406, + "rewards/tag_count_reward/mean": 0.6171875, + "rewards/tag_count_reward/std": 0.48834507167339325, + "step": 679, + "token_counts/after_target": 931.25, + "token_counts/after_think": 25.5, + "token_counts/before_target": 2401.0, + "token_counts/before_think": 403.75 + }, + { + "avg_penalty/after_target": 2.3157201409339905, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.36785146594047546, + "avg_penalty/before_think": 0.33599018678069115, + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 213.15625, + "completions/mean_terminated_length": 213.15625, + "completions/min_length": 66.75, + "completions/min_terminated_length": 66.75, + "epoch": 0.34, + "grad_norm": 8.782319068908691, + "kl": 16.078125, + "learning_rate": 1.6704266189587992e-05, + "loss": 1.5126, + "num_tokens": 23270909.0, + "reward": 1.2578125, + "reward_std": 0.9444818198680878, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4939897432923317, + "rewards/tag_count_reward/mean": 0.6484375, + "rewards/tag_count_reward/std": 0.4627397432923317, + "step": 680, + "token_counts/after_target": 704.75, + "token_counts/after_think": 24.5, + "token_counts/before_target": 2133.5, + "token_counts/before_think": 547.75 + }, + { + "avg_penalty/after_target": 2.97183495759964, + "avg_penalty/after_think": 0.0, + "avg_penalty/before_target": 0.32299626618623734, + "avg_penalty/before_think": 0.3105000779032707, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 224.03125, + "completions/mean_terminated_length": 224.03125, + "completions/min_length": 78.25, + "completions/min_terminated_length": 78.25, + "epoch": 0.3405, + "grad_norm": 18.20919418334961, + "kl": 15.21875, + "learning_rate": 1.6691306063588583e-05, + "loss": 1.5918, + "num_tokens": 23294031.0, + "reward": 1.16015625, + "reward_std": 1.032616600394249, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 
0.08539126068353653, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.5069767236709595, + "rewards/tag_count_reward/mean": 0.58203125, + "rewards/tag_count_reward/std": 0.495518758893013, + "step": 681, + "token_counts/after_target": 794.25, + "token_counts/after_think": 0.0, + "token_counts/before_target": 2256.25, + "token_counts/before_think": 534.0 + }, + { + "avg_penalty/after_target": 2.2972400784492493, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.35500283539295197, + "avg_penalty/before_think": 0.4801945090293884, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 247.0625, + "completions/mean_terminated_length": 247.0625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.341, + "grad_norm": 6.046574592590332, + "kl": 20.125, + "learning_rate": 1.6678325554710467e-05, + "loss": 1.7985, + "num_tokens": 23321475.0, + "reward": 1.10546875, + "reward_std": 0.9779802858829498, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.4939897432923317, + "rewards/tag_count_reward/mean": 0.55859375, + "rewards/tag_count_reward/std": 0.4863694831728935, + "step": 682, + "token_counts/after_target": 796.5, + "token_counts/after_think": 45.0, + "token_counts/before_target": 2706.0, + "token_counts/before_think": 405.5 + }, + { + "avg_penalty/after_target": 2.725339114665985, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3241914436221123, + "avg_penalty/before_think": 0.5391052067279816, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.75, + "completions/max_terminated_length": 532.75, + "completions/mean_length": 224.125, + "completions/mean_terminated_length": 224.125, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.3415, + "grad_norm": 3.296278953552246, + "kl": 21.375, + "learning_rate": 1.6665324702494524e-05, + "loss": 1.7995, + "num_tokens": 23344603.0, + "reward": 1.25390625, + "reward_std": 1.084810495376587, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.12909944355487823, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.500852182507515, + "rewards/tag_count_reward/mean": 0.58203125, + "rewards/tag_count_reward/std": 0.48514071106910706, + "step": 683, + "token_counts/after_target": 680.5, + "token_counts/after_think": 20.75, + "token_counts/before_target": 2433.75, + "token_counts/before_think": 451.0 + }, + { + "avg_penalty/after_target": 2.4298542141914368, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3601668030023575, + "avg_penalty/before_think": 0.43640825897455215, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 180.609375, + "completions/mean_terminated_length": 180.609375, + "completions/min_length": 68.75, + "completions/min_terminated_length": 68.75, + "epoch": 0.342, + "grad_norm": 6.185102939605713, + "kl": 13.0986328125, + "learning_rate": 1.665230354654361e-05, + "loss": 1.3341, + "num_tokens": 23365426.0, + "reward": 1.66015625, + "reward_std": 0.7109393179416656, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.11967839300632477, + "rewards/format_reward/mean": 0.78125, + 
"rewards/format_reward/std": 0.33539126068353653, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.31904733926057816, + "step": 684, + "token_counts/after_target": 442.0, + "token_counts/after_think": 23.0, + "token_counts/before_target": 1797.25, + "token_counts/before_think": 627.5 + }, + { + "avg_penalty/after_target": 3.8428134322166443, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.22868791222572327, + "avg_penalty/before_think": 0.334952000528574, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.5, + "completions/max_terminated_length": 465.5, + "completions/mean_length": 192.765625, + "completions/mean_terminated_length": 192.765625, + "completions/min_length": 67.75, + "completions/min_terminated_length": 67.75, + "epoch": 0.3425, + "grad_norm": 5.1348876953125, + "kl": 22.359375, + "learning_rate": 1.6639262126522417e-05, + "loss": 1.8954, + "num_tokens": 23389139.0, + "reward": 1.2890625, + "reward_std": 0.9188252091407776, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.48866813629865646, + "rewards/tag_count_reward/mean": 0.6640625, + "rewards/tag_count_reward/std": 0.44910678267478943, + "step": 685, + "token_counts/after_target": 519.25, + "token_counts/after_think": 9.0, + "token_counts/before_target": 2107.5, + "token_counts/before_think": 448.5 + }, + { + "avg_penalty/after_target": 2.4657487869262695, + "avg_penalty/after_think": 1.9460110664367676, + "avg_penalty/before_target": 0.4409053921699524, + "avg_penalty/before_think": 0.46260328218340874, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 773.25, + "completions/max_terminated_length": 773.25, + "completions/mean_length": 266.703125, + "completions/mean_terminated_length": 266.703125, + "completions/min_length": 44.75, + "completions/min_terminated_length": 44.75, + "epoch": 0.343, + "grad_norm": 3.3588151931762695, + "kl": 30.4375, + "learning_rate": 1.6626200482157378e-05, + "loss": 2.5272, + "num_tokens": 23417168.0, + "reward": 1.3125, + "reward_std": 0.948146715760231, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.49244368076324463, + "rewards/tag_count_reward/mean": 0.671875, + "rewards/tag_count_reward/std": 0.4638926088809967, + "step": 686, + "token_counts/after_target": 1149.25, + "token_counts/after_think": 38.5, + "token_counts/before_target": 2506.75, + "token_counts/before_think": 572.75 + }, + { + "avg_penalty/after_target": 2.6249742209911346, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.4021518751978874, + "avg_penalty/before_think": 0.4904988780617714, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.5, + "completions/max_terminated_length": 582.5, + "completions/mean_length": 213.734375, + "completions/mean_terminated_length": 213.734375, + "completions/min_length": 51.5, + "completions/min_terminated_length": 51.5, + "epoch": 0.3435, + "grad_norm": 3.8171069622039795, + "kl": 20.640625, + "learning_rate": 1.661311865323652e-05, + "loss": 1.8816, + "num_tokens": 23443487.0, + "reward": 1.5625, + "reward_std": 0.7748315185308456, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4106728211045265, + 
"rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.3749659135937691, + "step": 687, + "token_counts/after_target": 637.25, + "token_counts/after_think": 15.25, + "token_counts/before_target": 2210.5, + "token_counts/before_think": 556.75 + }, + { + "avg_penalty/after_target": 2.3305703997612, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.37271638959646225, + "avg_penalty/before_think": 0.4755716174840927, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.75, + "completions/max_terminated_length": 652.75, + "completions/mean_length": 226.625, + "completions/mean_terminated_length": 226.625, + "completions/min_length": 74.25, + "completions/min_terminated_length": 74.25, + "epoch": 0.344, + "grad_norm": 10.050450325012207, + "kl": 26.96875, + "learning_rate": 1.660001667960937e-05, + "loss": 2.1174, + "num_tokens": 23469399.0, + "reward": 1.390625, + "reward_std": 0.8581761121749878, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4745560586452484, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.4103594645857811, + "step": 688, + "token_counts/after_target": 645.0, + "token_counts/after_think": 51.5, + "token_counts/before_target": 2291.5, + "token_counts/before_think": 638.0 + }, + { + "avg_penalty/after_target": 2.758418023586273, + "avg_penalty/after_think": 3.942608416080475, + "avg_penalty/before_target": 0.6177019625902176, + "avg_penalty/before_think": 0.43781187385320663, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 857.25, + 
"completions/max_terminated_length": 745.75, + "completions/mean_length": 240.5, + "completions/mean_terminated_length": 215.7270851135254, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.3445, + "grad_norm": 20.320823669433594, + "kl": 38.71875, + "learning_rate": 1.6586894601186804e-05, + "loss": 2.8065, + "num_tokens": 23495111.0, + "reward": 1.10546875, + "reward_std": 0.8758482336997986, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.46875, + "rewards/format_reward/std": 0.5061737895011902, + "rewards/tag_count_reward/mean": 0.63671875, + "rewards/tag_count_reward/std": 0.4300270825624466, + "step": 689, + "token_counts/after_target": 1211.75, + "token_counts/after_think": 21.75, + "token_counts/before_target": 2200.75, + "token_counts/before_think": 413.75 + }, + { + "avg_penalty/after_target": 2.202647089958191, + "avg_penalty/after_think": 2.732132613658905, + "avg_penalty/before_target": 0.36416366696357727, + "avg_penalty/before_think": 0.5180529579520226, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.5, + "completions/max_terminated_length": 482.5, + "completions/mean_length": 227.78125, + "completions/mean_terminated_length": 227.78125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.345, + "grad_norm": 12.64722728729248, + "kl": 21.328125, + "learning_rate": 1.657375245794096e-05, + "loss": 1.6513, + "num_tokens": 23520313.0, + "reward": 1.40234375, + "reward_std": 0.76332987844944, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.49244368076324463, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 
0.3426741361618042, + "step": 690, + "token_counts/after_target": 589.5, + "token_counts/after_think": 98.75, + "token_counts/before_target": 2022.75, + "token_counts/before_think": 933.5 + }, + { + "avg_penalty/after_target": 2.083701580762863, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.27482350543141365, + "avg_penalty/before_think": 0.45694446563720703, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.75, + "completions/max_terminated_length": 428.75, + "completions/mean_length": 153.09375, + "completions/mean_terminated_length": 153.09375, + "completions/min_length": 62.75, + "completions/min_terminated_length": 62.75, + "epoch": 0.3455, + "grad_norm": 7.518307209014893, + "kl": 21.28125, + "learning_rate": 1.6560590289905074e-05, + "loss": 1.6575, + "num_tokens": 23540911.0, + "reward": 1.66015625, + "reward_std": 0.7237547188997269, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38336414843797684, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.3542133793234825, + "step": 691, + "token_counts/after_target": 272.75, + "token_counts/after_think": 32.5, + "token_counts/before_target": 1466.75, + "token_counts/before_think": 677.5 + }, + { + "avg_penalty/after_target": 2.1349483132362366, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5131728425621986, + "avg_penalty/before_think": 0.35202979296445847, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 686.75, + "completions/max_terminated_length": 533.25, + "completions/mean_length": 203.984375, 
+ "completions/mean_terminated_length": 190.4312515258789, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.346, + "grad_norm": 4.9378557205200195, + "kl": 28.234375, + "learning_rate": 1.6547408137173396e-05, + "loss": 2.4009, + "num_tokens": 23563150.0, + "reward": 1.58984375, + "reward_std": 0.7982907444238663, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4075859263539314, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.3828574940562248, + "step": 692, + "token_counts/after_target": 748.0, + "token_counts/after_think": 50.25, + "token_counts/before_target": 1755.75, + "token_counts/before_think": 709.75 + }, + { + "avg_penalty/after_target": 2.8227293491363525, + "avg_penalty/after_think": 2.67280113697052, + "avg_penalty/before_target": 0.3390537314116955, + "avg_penalty/before_think": 0.5712719485163689, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.5, + "completions/max_terminated_length": 589.5, + "completions/mean_length": 207.859375, + "completions/mean_terminated_length": 207.859375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.3465, + "grad_norm": 8.559056282043457, + "kl": 19.0390625, + "learning_rate": 1.6534206039901057e-05, + "loss": 1.7916, + "num_tokens": 23584693.0, + "reward": 1.64453125, + "reward_std": 0.7517900913953781, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.3979102149605751, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.3481706455349922, + "step": 693, + 
"token_counts/after_target": 517.0, + "token_counts/after_think": 111.75, + "token_counts/before_target": 1718.5, + "token_counts/before_think": 978.5 + }, + { + "avg_penalty/after_target": 2.168231636285782, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.6142356991767883, + "avg_penalty/before_think": 0.5504778921604156, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.25, + "completions/max_terminated_length": 702.25, + "completions/mean_length": 316.796875, + "completions/mean_terminated_length": 316.796875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.347, + "grad_norm": 3.8347980976104736, + "kl": 25.25, + "learning_rate": 1.6520984038303924e-05, + "loss": 2.1671, + "num_tokens": 23614984.0, + "reward": 1.1796875, + "reward_std": 0.8260995447635651, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.5030868947505951, + "rewards/tag_count_reward/mean": 0.6328125, + "rewards/tag_count_reward/std": 0.4001261070370674, + "step": 694, + "token_counts/after_target": 1576.0, + "token_counts/after_think": 125.5, + "token_counts/before_target": 2390.5, + "token_counts/before_think": 976.75 + }, + { + "avg_penalty/after_target": 2.2213212847709656, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.567300982773304, + "avg_penalty/before_think": 0.6263722665607929, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.25, + "completions/max_terminated_length": 653.25, + "completions/mean_length": 325.78125, + "completions/mean_terminated_length": 325.78125, 
+ "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.3475, + "grad_norm": 12.87842845916748, + "kl": 32.0, + "learning_rate": 1.650774217265851e-05, + "loss": 2.3805, + "num_tokens": 23647050.0, + "reward": 0.98046875, + "reward_std": 0.7666276842355728, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.359375, + "rewards/format_reward/std": 0.4876555874943733, + "rewards/tag_count_reward/mean": 0.62109375, + "rewards/tag_count_reward/std": 0.36799657344818115, + "step": 695, + "token_counts/after_target": 1461.5, + "token_counts/after_think": 117.25, + "token_counts/before_target": 2809.25, + "token_counts/before_think": 824.5 + }, + { + "avg_penalty/after_target": 2.1468453407287598, + "avg_penalty/after_think": 2.977500855922699, + "avg_penalty/before_target": 0.3496513068675995, + "avg_penalty/before_think": 0.9265371412038803, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.25, + "completions/max_terminated_length": 633.25, + "completions/mean_length": 320.171875, + "completions/mean_terminated_length": 320.171875, + "completions/min_length": 93.25, + "completions/min_terminated_length": 93.25, + "epoch": 0.348, + "grad_norm": 11.242276191711426, + "kl": 24.0625, + "learning_rate": 1.6494480483301836e-05, + "loss": 1.7817, + "num_tokens": 23682389.0, + "reward": 1.0078125, + "reward_std": 0.7648659497499466, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.47360680997371674, + "rewards/tag_count_reward/mean": 0.6328125, + "rewards/tag_count_reward/std": 0.34007831662893295, + "step": 696, + "token_counts/after_target": 795.5, + "token_counts/after_think": 232.5, + "token_counts/before_target": 
3151.75, + "token_counts/before_think": 943.0 + }, + { + "avg_penalty/after_target": 2.5066517889499664, + "avg_penalty/after_think": 2.4629084169864655, + "avg_penalty/before_target": 0.2944902814924717, + "avg_penalty/before_think": 0.5497612208127975, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.25, + "completions/max_terminated_length": 616.25, + "completions/mean_length": 229.1875, + "completions/mean_terminated_length": 229.1875, + "completions/min_length": 59.5, + "completions/min_terminated_length": 59.5, + "epoch": 0.3485, + "grad_norm": 4.2210187911987305, + "kl": 20.0, + "learning_rate": 1.6481199010631312e-05, + "loss": 1.6713, + "num_tokens": 23706625.0, + "reward": 1.30078125, + "reward_std": 0.8664395064115524, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.5625, + "rewards/format_reward/std": 0.5061737895011902, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.37832390516996384, + "step": 697, + "token_counts/after_target": 566.0, + "token_counts/after_think": 89.5, + "token_counts/before_target": 2098.5, + "token_counts/before_think": 913.0 + }, + { + "avg_penalty/after_target": 2.857107102870941, + "avg_penalty/after_think": 3.165485590696335, + "avg_penalty/before_target": 0.3018474839627743, + "avg_penalty/before_think": 0.4674641713500023, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.75, + "completions/max_terminated_length": 398.75, + "completions/mean_length": 181.546875, + "completions/mean_terminated_length": 181.546875, + "completions/min_length": 42.25, + 
"completions/min_terminated_length": 42.25, + "epoch": 0.349, + "grad_norm": 10.754042625427246, + "kl": 10.91015625, + "learning_rate": 1.64678977951046e-05, + "loss": 1.1544, + "num_tokens": 23727940.0, + "reward": 1.5546875, + "reward_std": 0.7334400713443756, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4493217319250107, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.31748950481414795, + "step": 698, + "token_counts/after_target": 406.0, + "token_counts/after_think": 73.25, + "token_counts/before_target": 1593.25, + "token_counts/before_think": 832.25 + }, + { + "avg_penalty/after_target": 2.777021288871765, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.2789624519646168, + "avg_penalty/before_think": 0.43529632687568665, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 190.953125, + "completions/mean_terminated_length": 190.953125, + "completions/min_length": 53.25, + "completions/min_terminated_length": 53.25, + "epoch": 0.3495, + "grad_norm": 11.02344799041748, + "kl": 12.734375, + "learning_rate": 1.645457687723951e-05, + "loss": 1.364, + "num_tokens": 23752017.0, + "reward": 1.57421875, + "reward_std": 0.758187860250473, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4308478757739067, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.36346903443336487, + "step": 699, + "token_counts/after_target": 375.75, + "token_counts/after_think": 25.0, + "token_counts/before_target": 1911.5, + "token_counts/before_think": 743.0 + }, 
+ { + "avg_penalty/after_target": 2.5143080055713654, + "avg_penalty/after_think": 3.979632258415222, + "avg_penalty/before_target": 0.2523280903697014, + "avg_penalty/before_think": 0.43952982872724533, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.25, + "completions/max_terminated_length": 379.25, + "completions/mean_length": 171.6875, + "completions/mean_terminated_length": 171.6875, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.35, + "grad_norm": 5.049373626708984, + "kl": 7.0634765625, + "learning_rate": 1.644123629761387e-05, + "loss": 0.76, + "num_tokens": 23771261.0, + "reward": 1.640625, + "reward_std": 0.58283631503582, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.3454566150903702, + "rewards/tag_count_reward/mean": 0.859375, + "rewards/tag_count_reward/std": 0.2669786885380745, + "step": 700, + "token_counts/after_target": 198.25, + "token_counts/after_think": 89.0, + "token_counts/before_target": 1612.0, + "token_counts/before_think": 847.75 + }, + { + "avg_penalty/after_target": 2.1736857891082764, + "avg_penalty/after_think": 3.9579805731773376, + "avg_penalty/before_target": 0.3378615751862526, + "avg_penalty/before_think": 0.5517077520489693, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.25, + "completions/max_terminated_length": 458.25, + "completions/mean_length": 212.03125, + "completions/mean_terminated_length": 212.03125, + "completions/min_length": 46.75, + "completions/min_terminated_length": 46.75, + "epoch": 0.3505, + "grad_norm": 
9.266314506530762, + "kl": 11.609375, + "learning_rate": 1.6427876096865394e-05, + "loss": 1.2319, + "num_tokens": 23794415.0, + "reward": 1.46484375, + "reward_std": 0.8336807787418365, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.47360680997371674, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.38019274920225143, + "step": 701, + "token_counts/after_target": 351.25, + "token_counts/after_think": 103.5, + "token_counts/before_target": 1925.5, + "token_counts/before_think": 1012.25 + }, + { + "avg_penalty/after_target": 2.4351671934127808, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3469664379954338, + "avg_penalty/before_think": 0.45017221570014954, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.25, + "completions/max_terminated_length": 407.25, + "completions/mean_length": 166.765625, + "completions/mean_terminated_length": 166.765625, + "completions/min_length": 26.75, + "completions/min_terminated_length": 26.75, + "epoch": 0.351, + "grad_norm": 7.725602626800537, + "kl": 12.046875, + "learning_rate": 1.641449631569158e-05, + "loss": 1.2834, + "num_tokens": 23816400.0, + "reward": 1.5, + "reward_std": 0.8600585609674454, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.125, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4713720977306366, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.37161924690008163, + "step": 702, + "token_counts/after_target": 312.5, + "token_counts/after_think": 147.75, + "token_counts/before_target": 1410.0, + "token_counts/before_think": 798.0 + }, + { + "avg_penalty/after_target": 2.975058078765869, + 
"avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.27286161482334137, + "avg_penalty/before_think": 0.4832029193639755, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.25, + "completions/max_terminated_length": 433.25, + "completions/mean_length": 181.65625, + "completions/mean_terminated_length": 181.65625, + "completions/min_length": 23.5, + "completions/min_terminated_length": 23.5, + "epoch": 0.3515, + "grad_norm": 4.999207496643066, + "kl": 15.015625, + "learning_rate": 1.6401096994849558e-05, + "loss": 1.2216, + "num_tokens": 23837882.0, + "reward": 1.41796875, + "reward_std": 0.7998262643814087, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4665650501847267, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.36829036846756935, + "step": 703, + "token_counts/after_target": 278.0, + "token_counts/after_think": 61.25, + "token_counts/before_target": 1773.25, + "token_counts/before_think": 794.0 + }, + { + "avg_penalty/after_target": 2.441582143306732, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.27607912570238113, + "avg_penalty/before_think": 0.41348033770918846, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.75, + "completions/max_terminated_length": 506.75, + "completions/mean_length": 182.84375, + "completions/mean_terminated_length": 182.84375, + "completions/min_length": 56.75, + "completions/min_terminated_length": 56.75, + "epoch": 0.352, + "grad_norm": 6.952452659606934, + "kl": 18.765625, + "learning_rate": 1.638767817515598e-05, + "loss": 
1.4676, + "num_tokens": 23863168.0, + "reward": 1.45703125, + "reward_std": 0.8155427575111389, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4704566150903702, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.3760947547852993, + "step": 704, + "token_counts/after_target": 329.25, + "token_counts/after_think": 22.0, + "token_counts/before_target": 1641.5, + "token_counts/before_think": 932.75 + }, + { + "avg_penalty/after_target": 2.9598105549812317, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.21140609309077263, + "avg_penalty/before_think": 0.42509925365448, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 187.328125, + "completions/mean_terminated_length": 187.328125, + "completions/min_length": 48.75, + "completions/min_terminated_length": 48.75, + "epoch": 0.3525, + "grad_norm": 2.965284824371338, + "kl": 9.46875, + "learning_rate": 1.63742398974869e-05, + "loss": 0.859, + "num_tokens": 23883861.0, + "reward": 1.6015625, + "reward_std": 0.7055931687355042, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.39964763820171356, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.32217568904161453, + "step": 705, + "token_counts/after_target": 380.75, + "token_counts/after_think": 55.5, + "token_counts/before_target": 1513.5, + "token_counts/before_think": 1047.5 + }, + { + "avg_penalty/after_target": 3.4644219279289246, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.24360565841197968, + "avg_penalty/before_think": 
0.4490375220775604, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.5, + "completions/max_terminated_length": 399.5, + "completions/mean_length": 170.90625, + "completions/mean_terminated_length": 170.90625, + "completions/min_length": 19.5, + "completions/min_terminated_length": 19.5, + "epoch": 0.353, + "grad_norm": 3.857682943344116, + "kl": 16.515625, + "learning_rate": 1.636078220277764e-05, + "loss": 1.3197, + "num_tokens": 23903327.0, + "reward": 1.296875, + "reward_std": 0.910567969083786, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.48866813629865646, + "rewards/tag_count_reward/mean": 0.671875, + "rewards/tag_count_reward/std": 0.4436829164624214, + "step": 706, + "token_counts/after_target": 446.0, + "token_counts/after_think": 15.25, + "token_counts/before_target": 1608.0, + "token_counts/before_think": 665.25 + }, + { + "avg_penalty/after_target": 2.787928193807602, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.2672165594995022, + "avg_penalty/before_think": 0.48424380272626877, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 171.203125, + "completions/mean_terminated_length": 171.203125, + "completions/min_length": 55.5, + "completions/min_terminated_length": 55.5, + "epoch": 0.3535, + "grad_norm": 4.975509166717529, + "kl": 12.21875, + "learning_rate": 1.6347305132022677e-05, + "loss": 1.1508, + "num_tokens": 23925292.0, + "reward": 1.59375, + "reward_std": 0.7535766214132309, + "rewards/accuracy_reward/mean": 
0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4361884370446205, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.33756352588534355, + "step": 707, + "token_counts/after_target": 225.0, + "token_counts/after_think": 41.75, + "token_counts/before_target": 1662.75, + "token_counts/before_think": 809.75 + }, + { + "avg_penalty/after_target": 2.5018815398216248, + "avg_penalty/after_think": 3.953591525554657, + "avg_penalty/before_target": 0.402489822357893, + "avg_penalty/before_think": 0.5820796340703964, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 242.9375, + "completions/mean_terminated_length": 242.9375, + "completions/min_length": 75.5, + "completions/min_terminated_length": 75.5, + "epoch": 0.354, + "grad_norm": 7.201395034790039, + "kl": 18.5, + "learning_rate": 1.6333808726275503e-05, + "loss": 1.5753, + "num_tokens": 23951688.0, + "reward": 1.0, + "reward_std": 0.8605970144271851, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.421875, + "rewards/format_reward/std": 0.49467839300632477, + "rewards/tag_count_reward/mean": 0.578125, + "rewards/tag_count_reward/std": 0.40540356934070587, + "step": 708, + "token_counts/after_target": 718.25, + "token_counts/after_think": 162.25, + "token_counts/before_target": 2112.25, + "token_counts/before_think": 894.25 + }, + { + "avg_penalty/after_target": 2.7911482453346252, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.30829983204603195, + "avg_penalty/before_think": 0.6059276238083839, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 265.953125, + "completions/mean_terminated_length": 265.953125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.3545, + "grad_norm": 8.168225288391113, + "kl": 15.09375, + "learning_rate": 1.632029302664851e-05, + "loss": 1.3192, + "num_tokens": 23977477.0, + "reward": 0.9296875, + "reward_std": 0.7870249003171921, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.328125, + "rewards/format_reward/std": 0.46875541657209396, + "rewards/tag_count_reward/mean": 0.6015625, + "rewards/tag_count_reward/std": 0.37144962698221207, + "step": 709, + "token_counts/after_target": 733.75, + "token_counts/after_think": 133.0, + "token_counts/before_target": 1887.0, + "token_counts/before_think": 1501.5 + }, + { + "avg_penalty/after_target": 3.370861232280731, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.2615714520215988, + "avg_penalty/before_think": 0.4272397980093956, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.25, + "completions/max_terminated_length": 378.25, + "completions/mean_length": 176.890625, + "completions/mean_terminated_length": 176.890625, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.355, + "grad_norm": 10.704628944396973, + "kl": 14.171875, + "learning_rate": 1.6306758074312866e-05, + "loss": 1.4148, + "num_tokens": 23997390.0, + "reward": 1.6484375, + "reward_std": 0.720800131559372, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + 
"rewards/format_reward/std": 0.37366948276758194, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.35294026881456375, + "step": 710, + "token_counts/after_target": 387.0, + "token_counts/after_think": 19.0, + "token_counts/before_target": 1546.75, + "token_counts/before_think": 877.5 + }, + { + "avg_penalty/after_target": 3.1899473667144775, + "avg_penalty/after_think": 2.9822295904159546, + "avg_penalty/before_target": 0.28988830745220184, + "avg_penalty/before_think": 0.5177756175398827, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.75, + "completions/max_terminated_length": 573.75, + "completions/mean_length": 234.109375, + "completions/mean_terminated_length": 234.109375, + "completions/min_length": 69.75, + "completions/min_terminated_length": 69.75, + "epoch": 0.3555, + "grad_norm": 11.136197090148926, + "kl": 23.03125, + "learning_rate": 1.6293203910498375e-05, + "loss": 1.7702, + "num_tokens": 24022357.0, + "reward": 1.0859375, + "reward_std": 0.7947375923395157, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.453125, + "rewards/format_reward/std": 0.46513500809669495, + "rewards/tag_count_reward/mean": 0.6328125, + "rewards/tag_count_reward/std": 0.3656652867794037, + "step": 711, + "token_counts/after_target": 760.25, + "token_counts/after_think": 44.0, + "token_counts/before_target": 2105.0, + "token_counts/before_think": 836.5 + }, + { + "avg_penalty/after_target": 1.9443695843219757, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.5413042977452278, + "avg_penalty/before_think": 0.5032208412885666, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.015625, + "completions/max_length": 826.75, + "completions/max_terminated_length": 785.0, + "completions/mean_length": 298.1875, + "completions/mean_terminated_length": 287.8625030517578, + "completions/min_length": 77.5, + "completions/min_terminated_length": 77.5, + "epoch": 0.356, + "grad_norm": 11.372245788574219, + "kl": 33.90625, + "learning_rate": 1.6279630576493383e-05, + "loss": 2.6707, + "num_tokens": 24053985.0, + "reward": 1.390625, + "reward_std": 0.8869766145944595, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.47354350984096527, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.43208809196949005, + "step": 712, + "token_counts/after_target": 1351.5, + "token_counts/after_think": 61.5, + "token_counts/before_target": 2621.25, + "token_counts/before_think": 736.75 + }, + { + "avg_penalty/after_target": 3.1464051604270935, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.34629228711128235, + "avg_penalty/before_think": 0.38246654346585274, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.5, + "completions/max_terminated_length": 506.5, + "completions/mean_length": 218.796875, + "completions/mean_terminated_length": 218.796875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.3565, + "grad_norm": 8.297054290771484, + "kl": 28.75, + "learning_rate": 1.6266038113644605e-05, + "loss": 2.3105, + "num_tokens": 24076980.0, + "reward": 1.3515625, + "reward_std": 0.9264561831951141, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.48456869274377823, + 
"rewards/tag_count_reward/mean": 0.6953125, + "rewards/tag_count_reward/std": 0.4563927799463272, + "step": 713, + "token_counts/after_target": 895.25, + "token_counts/after_think": 18.5, + "token_counts/before_target": 1853.5, + "token_counts/before_think": 733.5 + }, + { + "avg_penalty/after_target": 2.6368795037269592, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.4139551371335983, + "avg_penalty/before_think": 0.40918823331594467, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 617.25, + "completions/max_terminated_length": 617.25, + "completions/mean_length": 225.328125, + "completions/mean_terminated_length": 225.328125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.357, + "grad_norm": 3.485790252685547, + "kl": 24.28125, + "learning_rate": 1.6252426563357054e-05, + "loss": 1.9641, + "num_tokens": 24104473.0, + "reward": 1.46875, + "reward_std": 0.9177352041006088, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.44938503205776215, + "step": 714, + "token_counts/after_target": 635.5, + "token_counts/after_think": 27.0, + "token_counts/before_target": 2110.25, + "token_counts/before_think": 832.5 + }, + { + "avg_penalty/after_target": 2.144437700510025, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5269965678453445, + "avg_penalty/before_think": 0.5689968019723892, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 742.25, + 
"completions/max_terminated_length": 742.25, + "completions/mean_length": 268.59375, + "completions/mean_terminated_length": 268.59375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.3575, + "grad_norm": 2.1609885692596436, + "kl": 20.25, + "learning_rate": 1.6238795967093865e-05, + "loss": 1.8508, + "num_tokens": 24133151.0, + "reward": 1.578125, + "reward_std": 0.867702916264534, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.42516325414180756, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.42516325414180756, + "step": 715, + "token_counts/after_target": 885.0, + "token_counts/after_think": 95.0, + "token_counts/before_target": 2199.75, + "token_counts/before_think": 1117.75 + }, + { + "avg_penalty/after_target": 2.3106614351272583, + "avg_penalty/after_think": 2.9297946095466614, + "avg_penalty/before_target": 0.3569481447339058, + "avg_penalty/before_think": 0.5647782385349274, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.75, + "completions/max_terminated_length": 579.75, + "completions/mean_length": 235.046875, + "completions/mean_terminated_length": 235.046875, + "completions/min_length": 73.25, + "completions/min_terminated_length": 73.25, + "epoch": 0.358, + "grad_norm": 5.016480922698975, + "kl": 22.53125, + "learning_rate": 1.6225146366376198e-05, + "loss": 1.7696, + "num_tokens": 24156834.0, + "reward": 1.44921875, + "reward_std": 0.9121699780225754, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.46296359598636627, + "rewards/tag_count_reward/mean": 0.73046875, + 
"rewards/tag_count_reward/std": 0.45384427160024643, + "step": 716, + "token_counts/after_target": 630.25, + "token_counts/after_think": 53.5, + "token_counts/before_target": 2085.25, + "token_counts/before_think": 991.75 + }, + { + "avg_penalty/after_target": 2.4925502240657806, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.38265272602438927, + "avg_penalty/before_think": 0.48086051642894745, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.25, + "completions/max_terminated_length": 656.25, + "completions/mean_length": 242.109375, + "completions/mean_terminated_length": 242.109375, + "completions/min_length": 101.25, + "completions/min_terminated_length": 101.25, + "epoch": 0.3585, + "grad_norm": 7.088281154632568, + "kl": 17.15625, + "learning_rate": 1.6211477802783105e-05, + "loss": 1.6356, + "num_tokens": 24180377.0, + "reward": 1.59375, + "reward_std": 0.7818873524665833, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.37197671830654144, + "step": 717, + "token_counts/after_target": 659.75, + "token_counts/after_think": 55.75, + "token_counts/before_target": 2233.25, + "token_counts/before_think": 925.0 + }, + { + "avg_penalty/after_target": 2.7082466185092926, + "avg_penalty/after_think": 2.923352897167206, + "avg_penalty/before_target": 0.3550182729959488, + "avg_penalty/before_think": 0.42175282165408134, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + 
"completions/mean_length": 208.421875, + "completions/mean_terminated_length": 208.421875, + "completions/min_length": 59.5, + "completions/min_terminated_length": 59.5, + "epoch": 0.359, + "grad_norm": 5.1840434074401855, + "kl": 19.734375, + "learning_rate": 1.6197790317951403e-05, + "loss": 1.6891, + "num_tokens": 24202148.0, + "reward": 1.4453125, + "reward_std": 0.9059081524610519, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.45057061314582825, + "step": 718, + "token_counts/after_target": 622.75, + "token_counts/after_think": 64.0, + "token_counts/before_target": 1805.25, + "token_counts/before_think": 842.75 + }, + { + "avg_penalty/after_target": 2.4251630306243896, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.45794927328824997, + "avg_penalty/before_think": 0.39884861558675766, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 740.25, + "completions/max_terminated_length": 740.25, + "completions/mean_length": 249.890625, + "completions/mean_terminated_length": 237.89791870117188, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.3595, + "grad_norm": 3.1070311069488525, + "kl": 30.25, + "learning_rate": 1.6184083953575543e-05, + "loss": 2.4174, + "num_tokens": 24227725.0, + "reward": 1.30078125, + "reward_std": 0.9615532606840134, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.48989029973745346, + "rewards/tag_count_reward/mean": 0.66015625, + "rewards/tag_count_reward/std": 0.47794677317142487, + "step": 719, + 
"token_counts/after_target": 1006.25, + "token_counts/after_think": 31.0, + "token_counts/before_target": 2299.0, + "token_counts/before_think": 662.0 + }, + { + "avg_penalty/after_target": 2.5327278077602386, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.38898368924856186, + "avg_penalty/before_think": 0.3117539510130882, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.5, + "completions/max_terminated_length": 482.5, + "completions/mean_length": 198.28125, + "completions/mean_terminated_length": 198.28125, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.36, + "grad_norm": 10.002366065979004, + "kl": 13.8515625, + "learning_rate": 1.617035875140749e-05, + "loss": 1.3541, + "num_tokens": 24248959.0, + "reward": 1.62890625, + "reward_std": 0.7909648269414902, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.3926909863948822, + "step": 720, + "token_counts/after_target": 556.5, + "token_counts/after_think": 34.25, + "token_counts/before_target": 1489.25, + "token_counts/before_think": 1092.5 + }, + { + "avg_penalty/after_target": 2.4248445332050323, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5499993115663528, + "avg_penalty/before_think": 0.5189140364527702, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 801.5, + "completions/max_terminated_length": 616.75, + "completions/mean_length": 237.078125, + "completions/mean_terminated_length": 
211.49896621704102, + "completions/min_length": 74.75, + "completions/min_terminated_length": 74.75, + "epoch": 0.3605, + "grad_norm": 19.584543228149414, + "kl": 16.8984375, + "learning_rate": 1.6156614753256583e-05, + "loss": 2.0591, + "num_tokens": 24279364.0, + "reward": 1.73046875, + "reward_std": 0.6623103767633438, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3450859263539314, + "rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.3230995759367943, + "step": 721, + "token_counts/after_target": 1001.0, + "token_counts/after_think": 111.5, + "token_counts/before_target": 1705.0, + "token_counts/before_think": 975.75 + }, + { + "avg_penalty/after_target": 2.2366007268428802, + "avg_penalty/after_think": 3.4821589589118958, + "avg_penalty/before_target": 0.38009944558143616, + "avg_penalty/before_think": 0.49235614389181137, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.25, + "completions/max_terminated_length": 747.25, + "completions/mean_length": 254.828125, + "completions/mean_terminated_length": 254.828125, + "completions/min_length": 105.5, + "completions/min_terminated_length": 105.5, + "epoch": 0.361, + "grad_norm": 8.724885940551758, + "kl": 13.7421875, + "learning_rate": 1.6142852000989432e-05, + "loss": 1.5156, + "num_tokens": 24307273.0, + "reward": 1.75390625, + "reward_std": 0.6378519386053085, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3221946656703949, + "rewards/tag_count_reward/mean": 0.87890625, + "rewards/tag_count_reward/std": 0.3161345422267914, + "step": 722, + "token_counts/after_target": 770.5, + "token_counts/after_think": 43.25, 
+ "token_counts/before_target": 2020.5, + "token_counts/before_think": 1243.0 + }, + { + "avg_penalty/after_target": 2.7334148287773132, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.6547419130802155, + "avg_penalty/before_think": 0.6277299635112286, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 792.75, + "completions/max_terminated_length": 706.75, + "completions/mean_length": 314.046875, + "completions/mean_terminated_length": 282.60521697998047, + "completions/min_length": 79.25, + "completions/min_terminated_length": 79.25, + "epoch": 0.3615, + "grad_norm": 4.1670050621032715, + "kl": 24.6484375, + "learning_rate": 1.6129070536529767e-05, + "loss": 2.292, + "num_tokens": 24335948.0, + "reward": 1.54296875, + "reward_std": 0.8055422008037567, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.41110680997371674, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.39935222268104553, + "step": 723, + "token_counts/after_target": 1562.75, + "token_counts/after_think": 99.0, + "token_counts/before_target": 2234.75, + "token_counts/before_think": 1128.25 + }, + { + "avg_penalty/after_target": 2.545827805995941, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.35087933391332626, + "avg_penalty/before_think": 0.5666512250900269, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.5, + "completions/max_terminated_length": 564.5, + "completions/mean_length": 220.484375, + "completions/mean_terminated_length": 220.484375, + "completions/min_length": 55.0, + 
"completions/min_terminated_length": 55.0, + "epoch": 0.362, + "grad_norm": 4.116824150085449, + "kl": 17.21875, + "learning_rate": 1.6115270401858315e-05, + "loss": 1.5258, + "num_tokens": 24362235.0, + "reward": 1.6171875, + "reward_std": 0.7763454765081406, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.37442560493946075, + "step": 724, + "token_counts/after_target": 496.0, + "token_counts/after_think": 105.0, + "token_counts/before_target": 1914.25, + "token_counts/before_think": 1012.5 + }, + { + "avg_penalty/after_target": 2.7503538727760315, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3061496913433075, + "avg_penalty/before_think": 0.4744712710380554, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.25, + "completions/max_terminated_length": 497.25, + "completions/mean_length": 233.71875, + "completions/mean_terminated_length": 233.71875, + "completions/min_length": 75.75, + "completions/min_terminated_length": 75.75, + "epoch": 0.3625, + "grad_norm": 3.7019999027252197, + "kl": 13.3046875, + "learning_rate": 1.610145163901268e-05, + "loss": 1.2934, + "num_tokens": 24388761.0, + "reward": 1.71875, + "reward_std": 0.6988953948020935, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.34944770485162735, + "rewards/tag_count_reward/mean": 0.859375, + "rewards/tag_count_reward/std": 0.34944770485162735, + "step": 725, + "token_counts/after_target": 457.75, + "token_counts/after_think": 297.5, + "token_counts/before_target": 1440.5, + "token_counts/before_think": 1543.75 
+ }, + { + "avg_penalty/after_target": 2.9268625378608704, + "avg_penalty/after_think": 2.7750036120414734, + "avg_penalty/before_target": 0.5661161988973618, + "avg_penalty/before_think": 0.48074114695191383, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 699.0, + "completions/max_terminated_length": 599.75, + "completions/mean_length": 244.640625, + "completions/mean_terminated_length": 220.4197998046875, + "completions/min_length": 69.25, + "completions/min_terminated_length": 69.25, + "epoch": 0.363, + "grad_norm": 13.783076286315918, + "kl": 34.03125, + "learning_rate": 1.608761429008721e-05, + "loss": 2.6436, + "num_tokens": 24412530.0, + "reward": 1.5, + "reward_std": 0.8017352372407913, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.42707233130931854, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.38369544595479965, + "step": 726, + "token_counts/after_target": 1149.25, + "token_counts/after_think": 69.75, + "token_counts/before_target": 1760.5, + "token_counts/before_think": 934.75 + }, + { + "avg_penalty/after_target": 2.8525266647338867, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.28904079645872116, + "avg_penalty/before_think": 0.5807360634207726, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.75, + "completions/max_terminated_length": 556.75, + "completions/mean_length": 228.25, + "completions/mean_terminated_length": 228.25, + "completions/min_length": 87.5, + "completions/min_terminated_length": 87.5, + "epoch": 0.3635, + "grad_norm": 
18.634685516357422, + "kl": 26.765625, + "learning_rate": 1.607375839723287e-05, + "loss": 1.8629, + "num_tokens": 24440098.0, + "reward": 1.5, + "reward_std": 0.7718984186649323, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4503342807292938, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.3529936149716377, + "step": 727, + "token_counts/after_target": 481.75, + "token_counts/after_think": 93.25, + "token_counts/before_target": 2027.0, + "token_counts/before_think": 1050.0 + }, + { + "avg_penalty/after_target": 2.65713232755661, + "avg_penalty/after_think": 3.390119194984436, + "avg_penalty/before_target": 0.27642684802412987, + "avg_penalty/before_think": 0.7400415539741516, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.0, + "completions/max_terminated_length": 560.0, + "completions/mean_length": 237.46875, + "completions/mean_terminated_length": 237.46875, + "completions/min_length": 61.75, + "completions/min_terminated_length": 61.75, + "epoch": 0.364, + "grad_norm": 44.00825881958008, + "kl": 42.4375, + "learning_rate": 1.605988400265711e-05, + "loss": 2.4056, + "num_tokens": 24466112.0, + "reward": 1.046875, + "reward_std": 0.8584441989660263, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.421875, + "rewards/format_reward/std": 0.5049516260623932, + "rewards/tag_count_reward/mean": 0.625, + "rewards/tag_count_reward/std": 0.40822338312864304, + "step": 728, + "token_counts/after_target": 531.0, + "token_counts/after_think": 98.5, + "token_counts/before_target": 2402.5, + "token_counts/before_think": 767.5 + }, + { + "avg_penalty/after_target": 1.9116131663322449, + 
"avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.397376112639904, + "avg_penalty/before_think": 0.420404564589262, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 609.25, + "completions/max_terminated_length": 530.75, + "completions/mean_length": 258.171875, + "completions/mean_terminated_length": 247.80937957763672, + "completions/min_length": 89.5, + "completions/min_terminated_length": 89.5, + "epoch": 0.3645, + "grad_norm": 37.15560531616211, + "kl": 35.1875, + "learning_rate": 1.6045991148623752e-05, + "loss": 2.0507, + "num_tokens": 24495227.0, + "reward": 1.00390625, + "reward_std": 0.7541041225194931, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.34375, + "rewards/format_reward/std": 0.4697679653763771, + "rewards/tag_count_reward/mean": 0.66015625, + "rewards/tag_count_reward/std": 0.3471953496336937, + "step": 729, + "token_counts/after_target": 710.75, + "token_counts/after_think": 42.0, + "token_counts/before_target": 2416.5, + "token_counts/before_think": 961.5 + }, + { + "avg_penalty/after_target": 2.4887706637382507, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.397168792784214, + "avg_penalty/before_think": 0.8021383583545685, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 819.5, + "completions/max_terminated_length": 649.0, + "completions/mean_length": 345.734375, + "completions/mean_terminated_length": 324.1489715576172, + "completions/min_length": 86.75, + "completions/min_terminated_length": 86.75, + "epoch": 0.365, + "grad_norm": 24.789270401000977, + "kl": 39.8125, + "learning_rate": 
1.6032079877452825e-05, + "loss": 2.6325, + "num_tokens": 24533706.0, + "reward": 0.97265625, + "reward_std": 0.8725642114877701, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.390625, + "rewards/format_reward/std": 0.48989029973745346, + "rewards/tag_count_reward/mean": 0.58203125, + "rewards/tag_count_reward/std": 0.4314998537302017, + "step": 730, + "token_counts/after_target": 1671.5, + "token_counts/after_think": 70.75, + "token_counts/before_target": 2835.5, + "token_counts/before_think": 954.0 + }, + { + "avg_penalty/after_target": 2.3771859407424927, + "avg_penalty/after_think": 2.6745763421058655, + "avg_penalty/before_target": 0.46226345747709274, + "avg_penalty/before_think": 0.5602616295218468, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 949.75, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 316.3125, + "completions/mean_terminated_length": 292.5166702270508, + "completions/min_length": 60.5, + "completions/min_terminated_length": 60.5, + "epoch": 0.3655, + "grad_norm": 8.838038444519043, + "kl": 27.8125, + "learning_rate": 1.6018150231520486e-05, + "loss": 2.1085, + "num_tokens": 24564382.0, + "reward": 1.2421875, + "reward_std": 0.8456712812185287, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.53125, + "rewards/format_reward/std": 0.5102732330560684, + "rewards/tag_count_reward/mean": 0.7109375, + "rewards/tag_count_reward/std": 0.3764825612306595, + "step": 731, + "token_counts/after_target": 1150.75, + "token_counts/after_think": 172.25, + "token_counts/before_target": 2635.5, + "token_counts/before_think": 1102.5 + }, + { + "avg_penalty/after_target": 2.4472284018993378, + "avg_penalty/after_think": 1.9151585102081299, + 
"avg_penalty/before_target": 0.36475419253110886, + "avg_penalty/before_think": 0.4350610300898552, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.25, + "completions/max_terminated_length": 539.25, + "completions/mean_length": 280.546875, + "completions/mean_terminated_length": 280.546875, + "completions/min_length": 64.75, + "completions/min_terminated_length": 64.75, + "epoch": 0.366, + "grad_norm": 7.752838611602783, + "kl": 21.125, + "learning_rate": 1.6004202253258844e-05, + "loss": 1.5718, + "num_tokens": 24592257.0, + "reward": 1.15234375, + "reward_std": 0.9071794897317886, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.515625, + "rewards/format_reward/std": 0.5092606842517853, + "rewards/tag_count_reward/mean": 0.63671875, + "rewards/tag_count_reward/std": 0.43300212919712067, + "step": 732, + "token_counts/after_target": 690.5, + "token_counts/after_think": 114.0, + "token_counts/before_target": 2391.75, + "token_counts/before_think": 1292.5 + }, + { + "avg_penalty/after_target": 1.9690164923667908, + "avg_penalty/after_think": 3.7465838193893433, + "avg_penalty/before_target": 0.3693135790526867, + "avg_penalty/before_think": 0.5915762335062027, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 298.78125, + "completions/mean_terminated_length": 298.78125, + "completions/min_length": 81.25, + "completions/min_terminated_length": 81.25, + "epoch": 0.3665, + "grad_norm": 6.480328559875488, + "kl": 15.359375, + "learning_rate": 1.599023598515586e-05, + "loss": 1.383, + 
"num_tokens": 24619875.0, + "reward": 1.3984375, + "reward_std": 0.8503440022468567, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.46034691482782364, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.4077945202589035, + "step": 733, + "token_counts/after_target": 824.5, + "token_counts/after_think": 222.0, + "token_counts/before_target": 2113.0, + "token_counts/before_think": 1621.0 + }, + { + "avg_penalty/after_target": 2.168064922094345, + "avg_penalty/after_think": 2.964537560939789, + "avg_penalty/before_target": 0.5694353580474854, + "avg_penalty/before_think": 0.6861945390701294, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 886.75, + "completions/max_terminated_length": 866.0, + "completions/mean_length": 337.765625, + "completions/mean_terminated_length": 326.1750030517578, + "completions/min_length": 102.75, + "completions/min_terminated_length": 102.75, + "epoch": 0.367, + "grad_norm": 14.75118637084961, + "kl": 12.828125, + "learning_rate": 1.5976251469755214e-05, + "loss": 1.4375, + "num_tokens": 24654756.0, + "reward": 1.515625, + "reward_std": 0.8584757149219513, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.4208832308650017, + "step": 734, + "token_counts/after_target": 1317.0, + "token_counts/after_think": 159.25, + "token_counts/before_target": 2070.5, + "token_counts/before_think": 1857.5 + }, + { + "avg_penalty/after_target": 2.491993248462677, + "avg_penalty/after_think": 3.6371740102767944, + "avg_penalty/before_target": 
0.5422707945108414, + "avg_penalty/before_think": 0.5879713892936707, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 733.5, + "completions/max_terminated_length": 682.75, + "completions/mean_length": 304.21875, + "completions/mean_terminated_length": 292.32083892822266, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.3675, + "grad_norm": 22.606212615966797, + "kl": 10.2578125, + "learning_rate": 1.5962248749656158e-05, + "loss": 1.5127, + "num_tokens": 24683634.0, + "reward": 1.5078125, + "reward_std": 0.8122326284646988, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.3801939934492111, + "step": 735, + "token_counts/after_target": 1123.5, + "token_counts/after_think": 176.25, + "token_counts/before_target": 1748.25, + "token_counts/before_think": 1819.5 + }, + { + "avg_penalty/after_target": 2.6325777769088745, + "avg_penalty/after_think": 2.616072654724121, + "avg_penalty/before_target": 0.37862005829811096, + "avg_penalty/before_think": 0.5714396014809608, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.5, + "completions/max_terminated_length": 767.5, + "completions/mean_length": 314.0, + "completions/mean_terminated_length": 314.0, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.368, + "grad_norm": 18.965627670288086, + "kl": 9.76953125, + "learning_rate": 1.5948227867513416e-05, + "loss": 1.3495, + "num_tokens": 24713154.0, + 
"reward": 1.56640625, + "reward_std": 0.8044364899396896, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.43399807065725327, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.3718005269765854, + "step": 736, + "token_counts/after_target": 1067.5, + "token_counts/after_think": 190.0, + "token_counts/before_target": 2226.0, + "token_counts/before_think": 1540.5 + }, + { + "avg_penalty/after_target": 2.899036943912506, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.5315780229866505, + "avg_penalty/before_think": 0.6138479709625244, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 756.75, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 333.859375, + "completions/mean_terminated_length": 322.84896087646484, + "completions/min_length": 105.75, + "completions/min_terminated_length": 105.75, + "epoch": 0.3685, + "grad_norm": 27.552501678466797, + "kl": 10.265625, + "learning_rate": 1.5934188866037017e-05, + "loss": 1.5649, + "num_tokens": 24744201.0, + "reward": 1.4453125, + "reward_std": 0.8228070884943008, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.45565588772296906, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.3900618329644203, + "step": 737, + "token_counts/after_target": 1438.5, + "token_counts/after_think": 105.5, + "token_counts/before_target": 2017.0, + "token_counts/before_think": 1780.75 + }, + { + "avg_penalty/after_target": 2.083341747522354, + "avg_penalty/after_think": 3.3749342560768127, + "avg_penalty/before_target": 0.37831029295921326, + 
"avg_penalty/before_think": 0.48970089107751846, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 689.75, + "completions/max_terminated_length": 622.5, + "completions/mean_length": 295.84375, + "completions/mean_terminated_length": 284.69271087646484, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.369, + "grad_norm": 15.264387130737305, + "kl": 8.390625, + "learning_rate": 1.5920131787992198e-05, + "loss": 1.1349, + "num_tokens": 24774239.0, + "reward": 1.57421875, + "reward_std": 0.8210376352071762, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.44721361994743347, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.37430623918771744, + "step": 738, + "token_counts/after_target": 746.5, + "token_counts/after_think": 331.5, + "token_counts/before_target": 2002.75, + "token_counts/before_think": 1652.75 + }, + { + "avg_penalty/after_target": 2.824149429798126, + "avg_penalty/after_think": 3.757977783679962, + "avg_penalty/before_target": 0.3639206029474735, + "avg_penalty/before_think": 0.6360766366124153, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 310.125, + "completions/mean_terminated_length": 310.125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.3695, + "grad_norm": 19.010601043701172, + "kl": 7.0703125, + "learning_rate": 1.5906056676199256e-05, + "loss": 1.2619, + "num_tokens": 24806135.0, + "reward": 1.6640625, 
+ "reward_std": 0.6810206770896912, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.3020508736371994, + "step": 739, + "token_counts/after_target": 871.5, + "token_counts/after_think": 309.75, + "token_counts/before_target": 1890.25, + "token_counts/before_think": 1890.5 + }, + { + "avg_penalty/after_target": 2.133676826953888, + "avg_penalty/after_think": 2.04694664478302, + "avg_penalty/before_target": 0.40079624205827713, + "avg_penalty/before_think": 0.4041357487440109, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.25, + "completions/max_terminated_length": 569.25, + "completions/mean_length": 279.0625, + "completions/mean_terminated_length": 279.0625, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.37, + "grad_norm": 3.148770809173584, + "kl": 20.109375, + "learning_rate": 1.5891963573533424e-05, + "loss": 1.6336, + "num_tokens": 24838331.0, + "reward": 1.29296875, + "reward_std": 0.9743583500385284, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.11967839300632477, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.4939897432923317, + "rewards/tag_count_reward/mean": 0.63671875, + "rewards/tag_count_reward/std": 0.4616027697920799, + "step": 740, + "token_counts/after_target": 979.75, + "token_counts/after_think": 50.5, + "token_counts/before_target": 2184.25, + "token_counts/before_think": 1250.5 + }, + { + "avg_penalty/after_target": 2.512466788291931, + "avg_penalty/after_think": 2.214449256658554, + "avg_penalty/before_target": 0.48748598247766495, + "avg_penalty/before_think": 
0.4249720051884651, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 636.5, + "completions/max_terminated_length": 509.5, + "completions/mean_length": 264.25, + "completions/mean_terminated_length": 252.22396087646484, + "completions/min_length": 58.5, + "completions/min_terminated_length": 58.5, + "epoch": 0.3705, + "grad_norm": 8.547486305236816, + "kl": 12.0390625, + "learning_rate": 1.5877852522924733e-05, + "loss": 1.3342, + "num_tokens": 24863467.0, + "reward": 1.7578125, + "reward_std": 0.7840741276741028, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.11967839300632477, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.3780868947505951, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.34711357951164246, + "step": 741, + "token_counts/after_target": 735.25, + "token_counts/after_think": 57.5, + "token_counts/before_target": 1877.25, + "token_counts/before_think": 1558.0 + }, + { + "avg_penalty/after_target": 2.397205114364624, + "avg_penalty/after_think": 3.7983341813087463, + "avg_penalty/before_target": 0.3725458160042763, + "avg_penalty/before_think": 0.6281369626522064, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.5, + "completions/max_terminated_length": 628.5, + "completions/mean_length": 280.90625, + "completions/mean_terminated_length": 280.90625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.371, + "grad_norm": 2.739511251449585, + "kl": 18.875, + "learning_rate": 1.5863723567357892e-05, + "loss": 1.6749, + "num_tokens": 24894917.0, + "reward": 1.42578125, + "reward_std": 
0.9078236222267151, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4682852029800415, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.44539910554885864, + "step": 742, + "token_counts/after_target": 931.5, + "token_counts/after_think": 87.5, + "token_counts/before_target": 2080.5, + "token_counts/before_think": 1395.0 + }, + { + "avg_penalty/after_target": 1.8867920637130737, + "avg_penalty/after_think": 2.835910975933075, + "avg_penalty/before_target": 0.5044642388820648, + "avg_penalty/before_think": 0.5704278498888016, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.5, + "completions/max_terminated_length": 525.5, + "completions/mean_length": 261.453125, + "completions/mean_terminated_length": 261.453125, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.3715, + "grad_norm": 3.2231972217559814, + "kl": 18.578125, + "learning_rate": 1.584957674987216e-05, + "loss": 1.5892, + "num_tokens": 24924754.0, + "reward": 1.37109375, + "reward_std": 0.8669179677963257, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.46566852182149887, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.4195139855146408, + "step": 743, + "token_counts/after_target": 821.25, + "token_counts/after_think": 110.75, + "token_counts/before_target": 1976.0, + "token_counts/before_think": 1275.25 + }, + { + "avg_penalty/after_target": 2.7320157289505005, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.2881942130625248, + "avg_penalty/before_think": 0.4876077398657799, + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.75, + "completions/max_terminated_length": 489.75, + "completions/mean_length": 211.4375, + "completions/mean_terminated_length": 211.4375, + "completions/min_length": 53.5, + "completions/min_terminated_length": 53.5, + "epoch": 0.372, + "grad_norm": 4.2486677169799805, + "kl": 15.1875, + "learning_rate": 1.5835412113561176e-05, + "loss": 1.4991, + "num_tokens": 24947870.0, + "reward": 1.65625, + "reward_std": 0.786555141210556, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.37333881855010986, + "step": 744, + "token_counts/after_target": 349.5, + "token_counts/after_think": 195.0, + "token_counts/before_target": 1780.5, + "token_counts/before_think": 1058.0 + }, + { + "avg_penalty/after_target": 2.570281833410263, + "avg_penalty/after_think": 3.999569594860077, + "avg_penalty/before_target": 0.3734628036618233, + "avg_penalty/before_think": 0.5192757770419121, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 240.203125, + "completions/mean_terminated_length": 240.203125, + "completions/min_length": 77.75, + "completions/min_terminated_length": 77.75, + "epoch": 0.3725, + "grad_norm": 11.385233879089355, + "kl": 19.44921875, + "learning_rate": 1.5821229701572897e-05, + "loss": 1.4726, + "num_tokens": 24973019.0, + "reward": 1.55078125, + "reward_std": 0.6601425260305405, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.35956869274377823, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.3161431849002838, + "step": 745, + "token_counts/after_target": 484.75, + "token_counts/after_think": 40.5, + "token_counts/before_target": 2117.0, + "token_counts/before_think": 1201.0 + }, + { + "avg_penalty/after_target": 2.3153961896896362, + "avg_penalty/after_think": 2.9797114729881287, + "avg_penalty/before_target": 0.37905481457710266, + "avg_penalty/before_think": 0.4885300025343895, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.5, + "completions/max_terminated_length": 494.5, + "completions/mean_length": 245.890625, + "completions/mean_terminated_length": 245.890625, + "completions/min_length": 58.5, + "completions/min_terminated_length": 58.5, + "epoch": 0.373, + "grad_norm": 10.0663423538208, + "kl": 19.71875, + "learning_rate": 1.5807029557109398e-05, + "loss": 1.4763, + "num_tokens": 24999460.0, + "reward": 1.5625, + "reward_std": 0.7566939741373062, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.3956565484404564, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.3714214786887169, + "step": 746, + "token_counts/after_target": 536.75, + "token_counts/after_think": 60.0, + "token_counts/before_target": 1802.75, + "token_counts/before_think": 1534.75 + }, + { + "avg_penalty/after_target": 2.91222482919693, + "avg_penalty/after_think": 2.7895312309265137, + "avg_penalty/before_target": 0.4663548842072487, + "avg_penalty/before_think": 0.4351288974285126, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.25, + "completions/max_terminated_length": 699.25, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 39.25, + "completions/min_terminated_length": 39.25, + "epoch": 0.3735, + "grad_norm": 13.065926551818848, + "kl": 27.15625, + "learning_rate": 1.5792811723426787e-05, + "loss": 2.0058, + "num_tokens": 25029014.0, + "reward": 1.46875, + "reward_std": 0.870427668094635, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.44495995342731476, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.4141460731625557, + "step": 747, + "token_counts/after_target": 989.0, + "token_counts/after_think": 49.25, + "token_counts/before_target": 1958.25, + "token_counts/before_think": 1344.0 + }, + { + "avg_penalty/after_target": 2.135391891002655, + "avg_penalty/after_think": 2.9136104583740234, + "avg_penalty/before_target": 0.42281775921583176, + "avg_penalty/before_think": 0.5445233732461929, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.5, + "completions/max_terminated_length": 558.5, + "completions/mean_length": 228.234375, + "completions/mean_terminated_length": 228.234375, + "completions/min_length": 46.75, + "completions/min_terminated_length": 46.75, + "epoch": 0.374, + "grad_norm": 11.599949836730957, + "kl": 24.0, + "learning_rate": 1.5778576243835055e-05, + "loss": 1.743, + "num_tokens": 25058901.0, + "reward": 1.4296875, + "reward_std": 0.8283244669437408, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, 
+ "rewards/format_reward/std": 0.4546433389186859, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.4008542001247406, + "step": 748, + "token_counts/after_target": 583.75, + "token_counts/after_think": 154.75, + "token_counts/before_target": 1680.5, + "token_counts/before_think": 1232.75 + }, + { + "avg_penalty/after_target": 1.9948740601539612, + "avg_penalty/after_think": 2.9018222093582153, + "avg_penalty/before_target": 0.4193163216114044, + "avg_penalty/before_think": 0.6272376179695129, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 640.75, + "completions/max_terminated_length": 601.5, + "completions/mean_length": 249.0625, + "completions/mean_terminated_length": 237.71562957763672, + "completions/min_length": 51.75, + "completions/min_terminated_length": 51.75, + "epoch": 0.3745, + "grad_norm": 14.04477310180664, + "kl": 29.6875, + "learning_rate": 1.5764323161697933e-05, + "loss": 2.121, + "num_tokens": 25089113.0, + "reward": 1.37109375, + "reward_std": 0.84203040599823, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.4079870283603668, + "step": 749, + "token_counts/after_target": 797.0, + "token_counts/after_think": 123.25, + "token_counts/before_target": 2165.5, + "token_counts/before_think": 899.25 + }, + { + "avg_penalty/after_target": 3.1629319190979004, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3541225828230381, + "avg_penalty/before_think": 0.35791854932904243, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 608.25, + "completions/max_terminated_length": 608.25, + "completions/mean_length": 225.984375, + "completions/mean_terminated_length": 225.984375, + "completions/min_length": 69.25, + "completions/min_terminated_length": 69.25, + "epoch": 0.375, + "grad_norm": 3.502704381942749, + "kl": 23.828125, + "learning_rate": 1.575005252043279e-05, + "loss": 1.965, + "num_tokens": 25112136.0, + "reward": 1.4375, + "reward_std": 0.8619273155927658, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4550696536898613, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.41365931928157806, + "step": 750, + "token_counts/after_target": 732.0, + "token_counts/after_think": 38.75, + "token_counts/before_target": 2002.25, + "token_counts/before_think": 842.75 + }, + { + "avg_penalty/after_target": 2.5946610271930695, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.29685092717409134, + "avg_penalty/before_think": 0.555051676928997, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.5, + "completions/max_terminated_length": 421.5, + "completions/mean_length": 214.53125, + "completions/mean_terminated_length": 214.53125, + "completions/min_length": 71.5, + "completions/min_terminated_length": 71.5, + "epoch": 0.3755, + "grad_norm": 3.6497550010681152, + "kl": 17.5, + "learning_rate": 1.573576436351046e-05, + "loss": 1.3605, + "num_tokens": 25135402.0, + "reward": 1.49609375, + "reward_std": 0.8675287812948227, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4515564441680908, + 
"rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.40950794517993927, + "step": 751, + "token_counts/after_target": 429.25, + "token_counts/after_think": 81.0, + "token_counts/before_target": 1736.0, + "token_counts/before_think": 1186.25 + }, + { + "avg_penalty/after_target": 2.1820785105228424, + "avg_penalty/after_think": 3.875701069831848, + "avg_penalty/before_target": 0.4152674898505211, + "avg_penalty/before_think": 0.4564542807638645, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/max_terminated_length": 601.0, + "completions/mean_length": 223.046875, + "completions/mean_terminated_length": 223.046875, + "completions/min_length": 80.75, + "completions/min_terminated_length": 80.75, + "epoch": 0.376, + "grad_norm": 10.15235424041748, + "kl": 10.4921875, + "learning_rate": 1.5721458734455164e-05, + "loss": 1.2637, + "num_tokens": 25160877.0, + "reward": 1.7265625, + "reward_std": 0.6405241191387177, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3300696536898613, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.3126665949821472, + "step": 752, + "token_counts/after_target": 550.5, + "token_counts/after_think": 88.5, + "token_counts/before_target": 1399.0, + "token_counts/before_think": 1530.75 + }, + { + "avg_penalty/after_target": 2.7979275584220886, + "avg_penalty/after_think": 3.996785283088684, + "avg_penalty/before_target": 0.29274512454867363, + "avg_penalty/before_think": 0.5151457488536835, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + 
"completions/max_length": 404.75, + "completions/max_terminated_length": 404.75, + "completions/mean_length": 173.0, + "completions/mean_terminated_length": 173.0, + "completions/min_length": 59.75, + "completions/min_terminated_length": 59.75, + "epoch": 0.3765, + "grad_norm": 8.174013137817383, + "kl": 10.578125, + "learning_rate": 1.570713567684432e-05, + "loss": 1.2152, + "num_tokens": 25182861.0, + "reward": 1.68359375, + "reward_std": 0.7616578638553619, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38336414843797684, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.3634957820177078, + "step": 753, + "token_counts/after_target": 276.0, + "token_counts/after_think": 96.5, + "token_counts/before_target": 1405.75, + "token_counts/before_think": 989.75 + }, + { + "avg_penalty/after_target": 2.8898832201957703, + "avg_penalty/after_think": 2.768069803714752, + "avg_penalty/before_target": 0.30771083757281303, + "avg_penalty/before_think": 0.4653989374637604, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.25, + "completions/max_terminated_length": 520.25, + "completions/mean_length": 203.53125, + "completions/mean_terminated_length": 203.53125, + "completions/min_length": 73.5, + "completions/min_terminated_length": 73.5, + "epoch": 0.377, + "grad_norm": 18.961862564086914, + "kl": 6.560546875, + "learning_rate": 1.5692795234308446e-05, + "loss": 1.2379, + "num_tokens": 25206527.0, + "reward": 1.84765625, + "reward_std": 0.4860660582780838, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29578252136707306, + "rewards/tag_count_reward/mean": 
0.94140625, + "rewards/tag_count_reward/std": 0.1910214126110077, + "step": 754, + "token_counts/after_target": 570.25, + "token_counts/after_think": 85.75, + "token_counts/before_target": 1248.5, + "token_counts/before_think": 1352.0 + }, + { + "avg_penalty/after_target": 2.929944783449173, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.26697132736444473, + "avg_penalty/before_think": 0.3913131579756737, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.25, + "completions/max_terminated_length": 541.25, + "completions/mean_length": 179.8125, + "completions/mean_terminated_length": 179.8125, + "completions/min_length": 53.25, + "completions/min_terminated_length": 53.25, + "epoch": 0.3775, + "grad_norm": 15.755415916442871, + "kl": 11.85791015625, + "learning_rate": 1.5678437450531014e-05, + "loss": 1.4546, + "num_tokens": 25231139.0, + "reward": 1.71484375, + "reward_std": 0.578374907374382, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.3133598491549492, + "rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.27231772243976593, + "step": 755, + "token_counts/after_target": 489.0, + "token_counts/after_think": 29.0, + "token_counts/before_target": 1542.75, + "token_counts/before_think": 816.25 + }, + { + "avg_penalty/after_target": 2.614773392677307, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.32378578186035156, + "avg_penalty/before_think": 0.40301336720585823, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 
483.0, + "completions/mean_length": 212.765625, + "completions/mean_terminated_length": 212.765625, + "completions/min_length": 88.25, + "completions/min_terminated_length": 88.25, + "epoch": 0.378, + "grad_norm": 16.988502502441406, + "kl": 7.5, + "learning_rate": 1.566406236924833e-05, + "loss": 1.1314, + "num_tokens": 25253540.0, + "reward": 1.77734375, + "reward_std": 0.5465720742940903, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.28694770485162735, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.2809670716524124, + "step": 756, + "token_counts/after_target": 457.0, + "token_counts/after_think": 98.0, + "token_counts/before_target": 1491.75, + "token_counts/before_think": 1357.5 + }, + { + "avg_penalty/after_target": 2.725245773792267, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.30359191074967384, + "avg_penalty/before_think": 0.4746050462126732, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 163.6875, + "completions/mean_terminated_length": 163.6875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.3785, + "grad_norm": 3.6145763397216797, + "kl": 5.1025390625, + "learning_rate": 1.564967003424938e-05, + "loss": 0.8336, + "num_tokens": 25272784.0, + "reward": 1.85546875, + "reward_std": 0.3447744697332382, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.18616948276758194, + "rewards/tag_count_reward/mean": 0.93359375, + "rewards/tag_count_reward/std": 0.16130317002534866, + "step": 757, + 
"token_counts/after_target": 312.0, + "token_counts/after_think": 106.5, + "token_counts/before_target": 1289.25, + "token_counts/before_think": 911.25 + }, + { + "avg_penalty/after_target": 2.902764320373535, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.30238477885723114, + "avg_penalty/before_think": 0.4047457277774811, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.25, + "completions/max_terminated_length": 375.25, + "completions/mean_length": 165.140625, + "completions/mean_terminated_length": 165.140625, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.379, + "grad_norm": 6.024137496948242, + "kl": 13.0390625, + "learning_rate": 1.5635260489375714e-05, + "loss": 1.2724, + "num_tokens": 25293225.0, + "reward": 1.66015625, + "reward_std": 0.6773148030042648, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.37366948276758194, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.3149576038122177, + "step": 758, + "token_counts/after_target": 332.75, + "token_counts/after_think": 65.5, + "token_counts/before_target": 1483.0, + "token_counts/before_think": 761.0 + }, + { + "avg_penalty/after_target": 2.889309674501419, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.33910924941301346, + "avg_penalty/before_think": 0.4018559232354164, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.75, + "completions/max_terminated_length": 446.75, + "completions/mean_length": 192.703125, + "completions/mean_terminated_length": 
192.703125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.3795, + "grad_norm": 7.451897144317627, + "kl": 22.375, + "learning_rate": 1.5620833778521306e-05, + "loss": 1.6396, + "num_tokens": 25315366.0, + "reward": 1.3984375, + "reward_std": 0.843405470252037, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4876555874943733, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.3960738033056259, + "step": 759, + "token_counts/after_target": 450.5, + "token_counts/after_think": 81.5, + "token_counts/before_target": 1694.25, + "token_counts/before_think": 857.0 + }, + { + "avg_penalty/after_target": 2.857210487127304, + "avg_penalty/after_think": 3.9439300298690796, + "avg_penalty/before_target": 0.24581585824489594, + "avg_penalty/before_think": 0.5497068241238594, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.5, + "completions/max_terminated_length": 410.5, + "completions/mean_length": 159.609375, + "completions/mean_terminated_length": 159.609375, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.38, + "grad_norm": 7.129571914672852, + "kl": 17.1015625, + "learning_rate": 1.560638994563242e-05, + "loss": 1.4476, + "num_tokens": 25335341.0, + "reward": 1.6484375, + "reward_std": 0.7269202321767807, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.3325519412755966, + "step": 760, + "token_counts/after_target": 242.25, + "token_counts/after_think": 61.5, + 
"token_counts/before_target": 1426.0, + "token_counts/before_think": 824.0 + }, + { + "avg_penalty/after_target": 2.667393773794174, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.36566833034157753, + "avg_penalty/before_think": 0.573754794895649, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.5, + "completions/max_terminated_length": 531.5, + "completions/mean_length": 235.703125, + "completions/mean_terminated_length": 235.703125, + "completions/min_length": 65.75, + "completions/min_terminated_length": 65.75, + "epoch": 0.3805, + "grad_norm": 21.75952911376953, + "kl": 34.75, + "learning_rate": 1.5591929034707468e-05, + "loss": 2.3079, + "num_tokens": 25360682.0, + "reward": 1.078125, + "reward_std": 0.9053762257099152, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.48866813629865646, + "rewards/tag_count_reward/mean": 0.578125, + "rewards/tag_count_reward/std": 0.46234413981437683, + "step": 761, + "token_counts/after_target": 827.75, + "token_counts/after_think": 16.5, + "token_counts/before_target": 2136.0, + "token_counts/before_think": 791.0 + }, + { + "avg_penalty/after_target": 2.8918313682079315, + "avg_penalty/after_think": 3.5491268038749695, + "avg_penalty/before_target": 0.4280971586704254, + "avg_penalty/before_think": 0.4526873603463173, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.25, + "completions/max_terminated_length": 608.25, + "completions/mean_length": 201.3125, + "completions/mean_terminated_length": 201.3125, + "completions/min_length": 48.75, + 
"completions/min_terminated_length": 48.75, + "epoch": 0.381, + "grad_norm": 7.138913154602051, + "kl": 22.265625, + "learning_rate": 1.5577451089796904e-05, + "loss": 1.8931, + "num_tokens": 25382158.0, + "reward": 1.46875, + "reward_std": 0.8025095015764236, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4761601909995079, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.36203011870384216, + "step": 762, + "token_counts/after_target": 680.75, + "token_counts/after_think": 99.0, + "token_counts/before_target": 1523.75, + "token_counts/before_think": 917.5 + }, + { + "avg_penalty/after_target": 2.9808738231658936, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.2523358575999737, + "avg_penalty/before_think": 0.48580431193113327, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.5, + "completions/max_terminated_length": 484.5, + "completions/mean_length": 253.09375, + "completions/mean_terminated_length": 253.09375, + "completions/min_length": 85.5, + "completions/min_terminated_length": 85.5, + "epoch": 0.3815, + "grad_norm": 8.61538314819336, + "kl": 20.734375, + "learning_rate": 1.556295615500305e-05, + "loss": 1.5827, + "num_tokens": 25407828.0, + "reward": 1.38671875, + "reward_std": 0.7933825999498367, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.49345622956752777, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.3523974195122719, + "step": 763, + "token_counts/after_target": 616.25, + "token_counts/after_think": 81.0, + "token_counts/before_target": 1930.75, + "token_counts/before_think": 1421.5 + }, + { + 
"avg_penalty/after_target": 3.183040142059326, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.38091758266091347, + "avg_penalty/before_think": 0.3367980942130089, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 244.734375, + "completions/mean_terminated_length": 244.734375, + "completions/min_length": 58.25, + "completions/min_terminated_length": 58.25, + "epoch": 0.382, + "grad_norm": 10.475700378417969, + "kl": 30.875, + "learning_rate": 1.5548444274479995e-05, + "loss": 2.3286, + "num_tokens": 25436627.0, + "reward": 1.171875, + "reward_std": 0.9051436632871628, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.53125, + "rewards/format_reward/std": 0.50393907725811, + "rewards/tag_count_reward/mean": 0.640625, + "rewards/tag_count_reward/std": 0.44155777245759964, + "step": 764, + "token_counts/after_target": 1058.25, + "token_counts/after_think": 32.75, + "token_counts/before_target": 2074.25, + "token_counts/before_think": 750.5 + }, + { + "avg_penalty/after_target": 2.7254026532173157, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.34152358025312424, + "avg_penalty/before_think": 0.4997350499033928, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.5, + "completions/max_terminated_length": 622.5, + "completions/mean_length": 275.953125, + "completions/mean_terminated_length": 275.953125, + "completions/min_length": 75.75, + "completions/min_terminated_length": 75.75, + "epoch": 0.3825, + "grad_norm": 6.471933841705322, + "kl": 20.65625, + 
"learning_rate": 1.553391549243344e-05, + "loss": 1.6002, + "num_tokens": 25463264.0, + "reward": 1.30078125, + "reward_std": 0.8459053337574005, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.500852182507515, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.38538698107004166, + "step": 765, + "token_counts/after_target": 687.25, + "token_counts/after_think": 176.25, + "token_counts/before_target": 2369.5, + "token_counts/before_think": 1182.25 + }, + { + "avg_penalty/after_target": 2.5866363048553467, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.47396475821733475, + "avg_penalty/before_think": 0.6475431323051453, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 695.5, + "completions/max_terminated_length": 539.25, + "completions/mean_length": 270.3125, + "completions/mean_terminated_length": 257.5281295776367, + "completions/min_length": 59.5, + "completions/min_terminated_length": 59.5, + "epoch": 0.383, + "grad_norm": 2.919790267944336, + "kl": 21.09375, + "learning_rate": 1.5519369853120584e-05, + "loss": 1.8325, + "num_tokens": 25494164.0, + "reward": 1.30078125, + "reward_std": 0.8917406648397446, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.5018647313117981, + "rewards/tag_count_reward/mean": 0.70703125, + "rewards/tag_count_reward/std": 0.41414710134267807, + "step": 766, + "token_counts/after_target": 1007.75, + "token_counts/after_think": 66.0, + "token_counts/before_target": 2082.5, + "token_counts/before_think": 1168.75 + }, + { + "avg_penalty/after_target": 2.6692358553409576, + "avg_penalty/after_think": 4.0, + 
"avg_penalty/before_target": 0.31198466569185257, + "avg_penalty/before_think": 0.7277334704995155, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.25, + "completions/max_terminated_length": 706.25, + "completions/mean_length": 307.34375, + "completions/mean_terminated_length": 307.34375, + "completions/min_length": 91.5, + "completions/min_terminated_length": 91.5, + "epoch": 0.3835, + "grad_norm": 9.044387817382812, + "kl": 21.15625, + "learning_rate": 1.5504807400849957e-05, + "loss": 1.5553, + "num_tokens": 25523258.0, + "reward": 1.09765625, + "reward_std": 0.929220050573349, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.50393907725811, + "rewards/tag_count_reward/mean": 0.59765625, + "rewards/tag_count_reward/std": 0.45745280385017395, + "step": 767, + "token_counts/after_target": 789.25, + "token_counts/after_think": 80.0, + "token_counts/before_target": 2922.25, + "token_counts/before_think": 1126.0 + }, + { + "avg_penalty/after_target": 1.811722755432129, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4922761395573616, + "avg_penalty/before_think": 0.8965365290641785, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 774.5, + "completions/max_terminated_length": 704.0, + "completions/mean_length": 315.984375, + "completions/mean_terminated_length": 304.0072937011719, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.384, + "grad_norm": 9.34279727935791, + "kl": 14.453125, + "learning_rate": 1.549022817998132e-05, + "loss": 1.5893, + "num_tokens": 25554601.0, 
+ "reward": 1.4765625, + "reward_std": 0.8155173063278198, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4761601909995079, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.38247257471084595, + "step": 768, + "token_counts/after_target": 957.75, + "token_counts/after_think": 227.75, + "token_counts/before_target": 2867.25, + "token_counts/before_think": 1003.0 + }, + { + "avg_penalty/after_target": 2.619239032268524, + "avg_penalty/after_think": 1.8869569301605225, + "avg_penalty/before_target": 0.4295026771724224, + "avg_penalty/before_think": 0.5766118317842484, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 275.703125, + "completions/mean_terminated_length": 275.703125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.3845, + "grad_norm": 7.652512550354004, + "kl": 10.375, + "learning_rate": 1.5475632234925505e-05, + "loss": 1.1317, + "num_tokens": 25581430.0, + "reward": 1.4765625, + "reward_std": 0.7778189182281494, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4713720977306366, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.3488369770348072, + "step": 769, + "token_counts/after_target": 833.5, + "token_counts/after_think": 31.25, + "token_counts/before_target": 1876.75, + "token_counts/before_think": 1669.75 + }, + { + "avg_penalty/after_target": 2.6878766119480133, + "avg_penalty/after_think": 1.8770460486412048, + "avg_penalty/before_target": 0.3620971664786339, + "avg_penalty/before_think": 
0.4642082080245018, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.25, + "completions/max_terminated_length": 480.25, + "completions/mean_length": 247.109375, + "completions/mean_terminated_length": 247.109375, + "completions/min_length": 54.75, + "completions/min_terminated_length": 54.75, + "epoch": 0.385, + "grad_norm": 7.303313732147217, + "kl": 10.90625, + "learning_rate": 1.5461019610144292e-05, + "loss": 1.1271, + "num_tokens": 25606093.0, + "reward": 1.453125, + "reward_std": 0.8361241370439529, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.46566852182149887, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.3891477808356285, + "step": 770, + "token_counts/after_target": 722.5, + "token_counts/after_think": 49.5, + "token_counts/before_target": 2103.5, + "token_counts/before_think": 1078.25 + }, + { + "avg_penalty/after_target": 2.4177666306495667, + "avg_penalty/after_think": 2.905914604663849, + "avg_penalty/before_target": 0.34364571422338486, + "avg_penalty/before_think": 0.6690755486488342, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.75, + "completions/max_terminated_length": 764.75, + "completions/mean_length": 309.78125, + "completions/mean_terminated_length": 309.78125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.3855, + "grad_norm": 4.774855613708496, + "kl": 15.421875, + "learning_rate": 1.5446390350150272e-05, + "loss": 1.4315, + "num_tokens": 25638895.0, + "reward": 1.39453125, + "reward_std": 0.860550582408905, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.47770625352859497, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.4104587361216545, + "step": 771, + "token_counts/after_target": 952.25, + "token_counts/after_think": 171.0, + "token_counts/before_target": 2717.75, + "token_counts/before_think": 1115.5 + }, + { + "avg_penalty/after_target": 3.0168665647506714, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3475932441651821, + "avg_penalty/before_think": 0.4805581644177437, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 264.6875, + "completions/mean_terminated_length": 264.6875, + "completions/min_length": 54.75, + "completions/min_terminated_length": 54.75, + "epoch": 0.386, + "grad_norm": 3.7732200622558594, + "kl": 12.328125, + "learning_rate": 1.5431744499506707e-05, + "loss": 1.1664, + "num_tokens": 25664683.0, + "reward": 1.37109375, + "reward_std": 0.9131665378808975, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.47669370472431183, + "rewards/tag_count_reward/mean": 0.69921875, + "rewards/tag_count_reward/std": 0.4415494278073311, + "step": 772, + "token_counts/after_target": 768.75, + "token_counts/after_think": 115.5, + "token_counts/before_target": 1977.0, + "token_counts/before_think": 1373.75 + }, + { + "avg_penalty/after_target": 2.6363485455513, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.37497764825820923, + "avg_penalty/before_think": 0.8714324682950974, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.0, + "completions/max_terminated_length": 619.0, + "completions/mean_length": 278.71875, + "completions/mean_terminated_length": 278.71875, + "completions/min_length": 60.25, + "completions/min_terminated_length": 60.25, + "epoch": 0.3865, + "grad_norm": 5.715466022491455, + "kl": 21.6875, + "learning_rate": 1.54170821028274e-05, + "loss": 1.7638, + "num_tokens": 25692585.0, + "reward": 1.234375, + "reward_std": 0.919980600476265, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.48866813629865646, + "rewards/tag_count_reward/mean": 0.640625, + "rewards/tag_count_reward/std": 0.44249799847602844, + "step": 773, + "token_counts/after_target": 994.5, + "token_counts/after_think": 70.75, + "token_counts/before_target": 2377.75, + "token_counts/before_think": 1016.5 + }, + { + "avg_penalty/after_target": 2.5841881334781647, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3012007959187031, + "avg_penalty/before_think": 0.5050161778926849, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.5, + "completions/max_terminated_length": 663.5, + "completions/mean_length": 289.03125, + "completions/mean_terminated_length": 289.03125, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.387, + "grad_norm": 3.8306145668029785, + "kl": 20.734375, + "learning_rate": 1.5402403204776552e-05, + "loss": 1.7154, + "num_tokens": 25721899.0, + "reward": 1.2265625, + "reward_std": 0.9205406606197357, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 
0.5625, + "rewards/format_reward/std": 0.5018647313117981, + "rewards/tag_count_reward/mean": 0.6640625, + "rewards/tag_count_reward/std": 0.44721539318561554, + "step": 774, + "token_counts/after_target": 712.0, + "token_counts/after_think": 182.5, + "token_counts/before_target": 2691.75, + "token_counts/before_think": 1038.25 + }, + { + "avg_penalty/after_target": 2.232199877500534, + "avg_penalty/after_think": 3.7266754508018494, + "avg_penalty/before_target": 0.2845812402665615, + "avg_penalty/before_think": 0.4936584606766701, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 173.328125, + "completions/mean_terminated_length": 173.328125, + "completions/min_length": 37.25, + "completions/min_terminated_length": 37.25, + "epoch": 0.3875, + "grad_norm": 4.964750289916992, + "kl": 18.859375, + "learning_rate": 1.5387707850068633e-05, + "loss": 1.491, + "num_tokens": 25745120.0, + "reward": 1.3984375, + "reward_std": 0.8480055928230286, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.436277836561203, + "rewards/tag_count_reward/mean": 0.7109375, + "rewards/tag_count_reward/std": 0.41811980307102203, + "step": 775, + "token_counts/after_target": 301.0, + "token_counts/after_think": 86.25, + "token_counts/before_target": 1800.0, + "token_counts/before_think": 586.0 + }, + { + "avg_penalty/after_target": 2.1007337272167206, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4320851117372513, + "avg_penalty/before_think": 0.6630340218544006, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 530.5, + "completions/max_terminated_length": 530.5, + "completions/mean_length": 238.453125, + "completions/mean_terminated_length": 238.453125, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.388, + "grad_norm": 2.054965019226074, + "kl": 11.515625, + "learning_rate": 1.5372996083468242e-05, + "loss": 1.0396, + "num_tokens": 25770509.0, + "reward": 1.4765625, + "reward_std": 0.8736033141613007, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44187305867671967, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.43370652943849564, + "step": 776, + "token_counts/after_target": 621.5, + "token_counts/after_think": 164.25, + "token_counts/before_target": 1823.75, + "token_counts/before_think": 1205.75 + }, + { + "avg_penalty/after_target": 2.7228603959083557, + "avg_penalty/after_think": 2.9747453331947327, + "avg_penalty/before_target": 0.32655755430459976, + "avg_penalty/before_think": 0.5522842928767204, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.75, + "completions/max_terminated_length": 473.75, + "completions/mean_length": 207.328125, + "completions/mean_terminated_length": 207.328125, + "completions/min_length": 20.75, + "completions/min_terminated_length": 20.75, + "epoch": 0.3885, + "grad_norm": 3.344470739364624, + "kl": 18.09375, + "learning_rate": 1.5358267949789968e-05, + "loss": 1.4146, + "num_tokens": 25793074.0, + "reward": 1.3359375, + "reward_std": 0.9329729080200195, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.47770625352859497, + 
"rewards/tag_count_reward/mean": 0.6796875, + "rewards/tag_count_reward/std": 0.4605128541588783, + "step": 777, + "token_counts/after_target": 595.25, + "token_counts/after_think": 52.25, + "token_counts/before_target": 1649.75, + "token_counts/before_think": 1020.0 + }, + { + "avg_penalty/after_target": 2.4249871373176575, + "avg_penalty/after_think": 3.7274078726768494, + "avg_penalty/before_target": 0.3466746248304844, + "avg_penalty/before_think": 0.473498173058033, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.5, + "completions/max_terminated_length": 652.5, + "completions/mean_length": 267.59375, + "completions/mean_terminated_length": 267.59375, + "completions/min_length": 48.25, + "completions/min_terminated_length": 48.25, + "epoch": 0.389, + "grad_norm": 3.9426283836364746, + "kl": 16.5625, + "learning_rate": 1.5343523493898267e-05, + "loss": 1.3471, + "num_tokens": 25821976.0, + "reward": 1.33984375, + "reward_std": 0.8675166517496109, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.47354350984096527, + "rewards/tag_count_reward/mean": 0.69921875, + "rewards/tag_count_reward/std": 0.4088197723031044, + "step": 778, + "token_counts/after_target": 518.5, + "token_counts/after_think": 127.25, + "token_counts/before_target": 2289.0, + "token_counts/before_think": 1346.75 + }, + { + "avg_penalty/after_target": 2.1336984038352966, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.376501627266407, + "avg_penalty/before_think": 0.5823310315608978, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.75, + 
"completions/max_terminated_length": 545.75, + "completions/mean_length": 261.515625, + "completions/mean_terminated_length": 261.515625, + "completions/min_length": 47.5, + "completions/min_terminated_length": 47.5, + "epoch": 0.3895, + "grad_norm": 2.306180000305176, + "kl": 14.0546875, + "learning_rate": 1.53287627607073e-05, + "loss": 1.2328, + "num_tokens": 25846521.0, + "reward": 1.39453125, + "reward_std": 0.843768447637558, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.42695631086826324, + "rewards/tag_count_reward/mean": 0.70703125, + "rewards/tag_count_reward/std": 0.4248421937227249, + "step": 779, + "token_counts/after_target": 612.75, + "token_counts/after_think": 179.0, + "token_counts/before_target": 2229.5, + "token_counts/before_think": 1163.0 + }, + { + "avg_penalty/after_target": 2.781758964061737, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4304594546556473, + "avg_penalty/before_think": 0.46441294252872467, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 870.0, + "completions/max_terminated_length": 631.5, + "completions/mean_length": 295.09375, + "completions/mean_terminated_length": 260.36355209350586, + "completions/min_length": 53.75, + "completions/min_terminated_length": 53.75, + "epoch": 0.39, + "grad_norm": 5.29713773727417, + "kl": 24.5, + "learning_rate": 1.531398579518083e-05, + "loss": 2.1581, + "num_tokens": 25875647.0, + "reward": 1.19921875, + "reward_std": 0.9562433809041977, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.4939897432923317, + "rewards/tag_count_reward/mean": 0.62109375, + "rewards/tag_count_reward/std": 
0.476992592215538, + "step": 780, + "token_counts/after_target": 1183.25, + "token_counts/after_think": 47.25, + "token_counts/before_target": 2830.25, + "token_counts/before_think": 660.75 + }, + { + "avg_penalty/after_target": 3.2413036823272705, + "avg_penalty/after_think": 2.9719430804252625, + "avg_penalty/before_target": 0.32924503833055496, + "avg_penalty/before_think": 0.45872781425714493, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.5, + "completions/max_terminated_length": 672.5, + "completions/mean_length": 255.859375, + "completions/mean_terminated_length": 255.859375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.3905, + "grad_norm": 11.724968910217285, + "kl": 17.359375, + "learning_rate": 1.529919264233205e-05, + "loss": 1.7282, + "num_tokens": 25903174.0, + "reward": 1.3984375, + "reward_std": 0.9040469974279404, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4704566150903702, + "rewards/tag_count_reward/mean": 0.7109375, + "rewards/tag_count_reward/std": 0.4401415288448334, + "step": 781, + "token_counts/after_target": 897.5, + "token_counts/after_think": 80.5, + "token_counts/before_target": 2236.5, + "token_counts/before_think": 879.25 + }, + { + "avg_penalty/after_target": 2.9382994771003723, + "avg_penalty/after_think": 0.9966492056846619, + "avg_penalty/before_target": 0.3021574281156063, + "avg_penalty/before_think": 0.5089776962995529, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 555.25, + "completions/max_terminated_length": 464.5, + 
"completions/mean_length": 220.578125, + "completions/mean_terminated_length": 209.09062957763672, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.391, + "grad_norm": 10.791216850280762, + "kl": 20.640625, + "learning_rate": 1.5284383347223473e-05, + "loss": 1.8287, + "num_tokens": 25927179.0, + "reward": 1.26953125, + "reward_std": 0.9344812035560608, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.4788651168346405, + "rewards/tag_count_reward/mean": 0.64453125, + "rewards/tag_count_reward/std": 0.4589909166097641, + "step": 782, + "token_counts/after_target": 679.0, + "token_counts/after_think": 109.25, + "token_counts/before_target": 2129.75, + "token_counts/before_think": 611.25 + }, + { + "avg_penalty/after_target": 2.5865642428398132, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.5009742267429829, + "avg_penalty/before_think": 0.8527604714035988, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 875.25, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 316.890625, + "completions/mean_terminated_length": 290.8072967529297, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.3915, + "grad_norm": 11.63038158416748, + "kl": 28.6875, + "learning_rate": 1.5269557954966777e-05, + "loss": 2.2177, + "num_tokens": 25956292.0, + "reward": 0.76953125, + "reward_std": 0.7211889922618866, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.234375, + "rewards/format_reward/std": 0.4255262687802315, + "rewards/tag_count_reward/mean": 0.53515625, + "rewards/tag_count_reward/std": 0.37616661190986633, + "step": 783, + 
"token_counts/after_target": 1365.0, + "token_counts/after_think": 44.5, + "token_counts/before_target": 2701.25, + "token_counts/before_think": 959.5 + }, + { + "avg_penalty/after_target": 2.720175415277481, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.45137110725045204, + "avg_penalty/before_think": 0.7126444168388844, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.5, + "completions/max_terminated_length": 681.5, + "completions/mean_length": 317.171875, + "completions/mean_terminated_length": 317.171875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.392, + "grad_norm": 10.540132522583008, + "kl": 24.296875, + "learning_rate": 1.5254716510722678e-05, + "loss": 1.966, + "num_tokens": 25985295.0, + "reward": 0.953125, + "reward_std": 0.75486820936203, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.328125, + "rewards/format_reward/std": 0.4612434431910515, + "rewards/tag_count_reward/mean": 0.625, + "rewards/tag_count_reward/std": 0.3690779581665993, + "step": 784, + "token_counts/after_target": 1331.25, + "token_counts/after_think": 112.5, + "token_counts/before_target": 2914.5, + "token_counts/before_think": 716.5 + }, + { + "avg_penalty/after_target": 2.3082556426525116, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.5105725675821304, + "avg_penalty/before_think": 0.7879667729139328, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 801.75, + "completions/max_terminated_length": 582.5, + "completions/mean_length": 309.6875, + "completions/mean_terminated_length": 
286.6479263305664, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.3925, + "grad_norm": 10.174932479858398, + "kl": 21.40625, + "learning_rate": 1.5239859059700794e-05, + "loss": 2.0261, + "num_tokens": 26015643.0, + "reward": 1.15625, + "reward_std": 0.9527851939201355, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.515625, + "rewards/format_reward/std": 0.5153852254152298, + "rewards/tag_count_reward/mean": 0.625, + "rewards/tag_count_reward/std": 0.4501163810491562, + "step": 785, + "token_counts/after_target": 1472.25, + "token_counts/after_think": 42.25, + "token_counts/before_target": 2529.75, + "token_counts/before_think": 910.75 + }, + { + "avg_penalty/after_target": 2.297259271144867, + "avg_penalty/after_think": 2.6971611976623535, + "avg_penalty/before_target": 0.42177218943834305, + "avg_penalty/before_think": 0.4866773337125778, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.0, + "completions/max_terminated_length": 692.0, + "completions/mean_length": 272.359375, + "completions/mean_terminated_length": 272.359375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.393, + "grad_norm": 7.433753967285156, + "kl": 16.578125, + "learning_rate": 1.5224985647159489e-05, + "loss": 1.6668, + "num_tokens": 26046354.0, + "reward": 1.44140625, + "reward_std": 0.8564598262310028, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.46822190284729004, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.410181500017643, + "step": 786, + "token_counts/after_target": 786.5, + "token_counts/after_think": 145.0, + 
"token_counts/before_target": 2244.25, + "token_counts/before_think": 1182.0 + }, + { + "avg_penalty/after_target": 2.744532346725464, + "avg_penalty/after_think": 2.900141417980194, + "avg_penalty/before_target": 0.5017591789364815, + "avg_penalty/before_think": 0.5950889997184277, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 696.75, + "completions/max_terminated_length": 569.75, + "completions/mean_length": 281.28125, + "completions/mean_terminated_length": 270.13854217529297, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.3935, + "grad_norm": 5.833712577819824, + "kl": 20.40625, + "learning_rate": 1.5210096318405768e-05, + "loss": 1.8934, + "num_tokens": 26076740.0, + "reward": 1.328125, + "reward_std": 0.9211724698543549, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.48079314827919006, + "rewards/tag_count_reward/mean": 0.6875, + "rewards/tag_count_reward/std": 0.45677995681762695, + "step": 787, + "token_counts/after_target": 1194.0, + "token_counts/after_think": 48.25, + "token_counts/before_target": 2346.75, + "token_counts/before_think": 911.5 + }, + { + "avg_penalty/after_target": 2.889701098203659, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3256221003830433, + "avg_penalty/before_think": 0.3960225060582161, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 751.25, + "completions/max_terminated_length": 693.75, + "completions/mean_length": 250.0625, + "completions/mean_terminated_length": 238.45938110351562, + "completions/min_length": 71.5, 
+ "completions/min_terminated_length": 71.5, + "epoch": 0.394, + "grad_norm": 3.8676793575286865, + "kl": 16.953125, + "learning_rate": 1.5195191118795095e-05, + "loss": 1.5048, + "num_tokens": 26103336.0, + "reward": 1.4609375, + "reward_std": 0.8626919090747833, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.46513500809669495, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.41633398085832596, + "step": 788, + "token_counts/after_target": 850.25, + "token_counts/after_think": 7.75, + "token_counts/before_target": 2199.75, + "token_counts/before_think": 943.25 + }, + { + "avg_penalty/after_target": 1.9905600249767303, + "avg_penalty/after_think": 3.7687368392944336, + "avg_penalty/before_target": 0.2941829189658165, + "avg_penalty/before_think": 0.5958874002099037, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 207.671875, + "completions/mean_terminated_length": 207.671875, + "completions/min_length": 58.75, + "completions/min_terminated_length": 58.75, + "epoch": 0.3945, + "grad_norm": 3.7367355823516846, + "kl": 15.796875, + "learning_rate": 1.5180270093731305e-05, + "loss": 1.437, + "num_tokens": 26124627.0, + "reward": 1.6171875, + "reward_std": 0.7292187809944153, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4057852029800415, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.34117214381694794, + "step": 789, + "token_counts/after_target": 317.5, + "token_counts/after_think": 203.0, + "token_counts/before_target": 1675.5, + 
"token_counts/before_think": 1126.75 + }, + { + "avg_penalty/after_target": 3.424663245677948, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4197956621646881, + "avg_penalty/before_think": 0.42333027720451355, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.0, + "completions/max_terminated_length": 527.0, + "completions/mean_length": 180.6875, + "completions/mean_terminated_length": 180.6875, + "completions/min_length": 24.25, + "completions/min_terminated_length": 24.25, + "epoch": 0.395, + "grad_norm": 4.509520530700684, + "kl": 24.75, + "learning_rate": 1.516533328866642e-05, + "loss": 2.2065, + "num_tokens": 26145679.0, + "reward": 1.4921875, + "reward_std": 0.8181963115930557, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4682852029800415, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.38206514716148376, + "step": 790, + "token_counts/after_target": 750.5, + "token_counts/after_think": 25.75, + "token_counts/before_target": 1471.5, + "token_counts/before_think": 643.25 + }, + { + "avg_penalty/after_target": 2.3846721053123474, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.394909992814064, + "avg_penalty/before_think": 0.5403296239674091, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.25, + "completions/max_terminated_length": 708.25, + "completions/mean_length": 246.453125, + "completions/mean_terminated_length": 246.453125, + "completions/min_length": 55.5, + "completions/min_terminated_length": 55.5, + "epoch": 0.3955, + "grad_norm": 
23.038761138916016, + "kl": 40.875, + "learning_rate": 1.5150380749100545e-05, + "loss": 2.8843, + "num_tokens": 26170028.0, + "reward": 1.35546875, + "reward_std": 0.9120800346136093, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4788651168346405, + "rewards/tag_count_reward/mean": 0.69921875, + "rewards/tag_count_reward/std": 0.44453515857458115, + "step": 791, + "token_counts/after_target": 888.25, + "token_counts/after_think": 123.75, + "token_counts/before_target": 2200.25, + "token_counts/before_think": 731.0 + }, + { + "avg_penalty/after_target": 2.58377406001091, + "avg_penalty/after_think": 2.125371605157852, + "avg_penalty/before_target": 0.2845398187637329, + "avg_penalty/before_think": 0.44266846030950546, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.5, + "completions/max_terminated_length": 519.5, + "completions/mean_length": 209.953125, + "completions/mean_terminated_length": 209.953125, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.396, + "grad_norm": 18.189743041992188, + "kl": 28.5, + "learning_rate": 1.5135412520581703e-05, + "loss": 1.9162, + "num_tokens": 26195481.0, + "reward": 1.55078125, + "reward_std": 0.7987058162689209, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4383598491549492, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.37731755524873734, + "step": 792, + "token_counts/after_target": 508.25, + "token_counts/after_think": 39.75, + "token_counts/before_target": 1873.5, + "token_counts/before_think": 937.75 + }, + { + "avg_penalty/after_target": 2.313332825899124, + 
"avg_penalty/after_think": 1.7366613149642944, + "avg_penalty/before_target": 0.4008464068174362, + "avg_penalty/before_think": 0.5775370448827744, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.75, + "completions/max_terminated_length": 562.75, + "completions/mean_length": 219.5, + "completions/mean_terminated_length": 219.5, + "completions/min_length": 42.75, + "completions/min_terminated_length": 42.75, + "epoch": 0.3965, + "grad_norm": 27.268259048461914, + "kl": 39.46875, + "learning_rate": 1.5120428648705716e-05, + "loss": 2.5814, + "num_tokens": 26218729.0, + "reward": 1.4375, + "reward_std": 0.8795275837182999, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4625816270709038, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.42448264360427856, + "step": 793, + "token_counts/after_target": 742.5, + "token_counts/after_think": 23.0, + "token_counts/before_target": 1862.5, + "token_counts/before_think": 884.0 + }, + { + "avg_penalty/after_target": 2.729707360267639, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3319835998117924, + "avg_penalty/before_think": 0.3226239085197449, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.25, + "completions/max_terminated_length": 450.25, + "completions/mean_length": 189.703125, + "completions/mean_terminated_length": 189.703125, + "completions/min_length": 59.25, + "completions/min_terminated_length": 59.25, + "epoch": 0.397, + "grad_norm": 19.51710319519043, + "kl": 35.375, + "learning_rate": 1.510542917911606e-05, + "loss": 
2.3482, + "num_tokens": 26242598.0, + "reward": 1.39453125, + "reward_std": 0.9150842279195786, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4713720977306366, + "rewards/tag_count_reward/mean": 0.70703125, + "rewards/tag_count_reward/std": 0.45288366079330444, + "step": 794, + "token_counts/after_target": 490.5, + "token_counts/after_think": 54.0, + "token_counts/before_target": 1842.25, + "token_counts/before_think": 648.5 + }, + { + "avg_penalty/after_target": 2.493801236152649, + "avg_penalty/after_think": 1.9676145315170288, + "avg_penalty/before_target": 0.3876836895942688, + "avg_penalty/before_think": 0.28697434440255165, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 164.125, + "completions/mean_terminated_length": 164.125, + "completions/min_length": 47.75, + "completions/min_terminated_length": 47.75, + "epoch": 0.3975, + "grad_norm": 8.688580513000488, + "kl": 25.794921875, + "learning_rate": 1.5090414157503715e-05, + "loss": 1.8762, + "num_tokens": 26261534.0, + "reward": 1.57421875, + "reward_std": 0.7632801532745361, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.38879410922527313, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.34191910922527313, + "step": 795, + "token_counts/after_target": 501.25, + "token_counts/after_think": 8.75, + "token_counts/before_target": 1446.0, + "token_counts/before_think": 670.0 + }, + { + "avg_penalty/after_target": 2.9505032300949097, + "avg_penalty/after_think": 2.9211314022541046, + 
"avg_penalty/before_target": 0.41145193576812744, + "avg_penalty/before_think": 0.5209903493523598, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 668.25, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 220.765625, + "completions/mean_terminated_length": 208.60625457763672, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.398, + "grad_norm": 8.564960479736328, + "kl": 26.90625, + "learning_rate": 1.5075383629607043e-05, + "loss": 2.056, + "num_tokens": 26285055.0, + "reward": 1.5390625, + "reward_std": 0.8087482452392578, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.44091323018074036, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.38711271435022354, + "step": 796, + "token_counts/after_target": 650.5, + "token_counts/after_think": 41.5, + "token_counts/before_target": 1941.25, + "token_counts/before_think": 899.0 + }, + { + "avg_penalty/after_target": 2.059827357530594, + "avg_penalty/after_think": 3.7847801446914673, + "avg_penalty/before_target": 0.28824183344841003, + "avg_penalty/before_think": 0.433058999478817, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 172.390625, + "completions/mean_terminated_length": 172.390625, + "completions/min_length": 47.5, + "completions/min_terminated_length": 47.5, + "epoch": 0.3985, + "grad_norm": 8.421216011047363, + "kl": 29.46875, + "learning_rate": 1.5060337641211637e-05, + "loss": 2.2314, + 
"num_tokens": 26305416.0, + "reward": 1.51171875, + "reward_std": 0.8203939646482468, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4339347705245018, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.4112183004617691, + "step": 797, + "token_counts/after_target": 275.0, + "token_counts/after_think": 47.75, + "token_counts/before_target": 1763.25, + "token_counts/before_think": 672.25 + }, + { + "avg_penalty/after_target": 2.0738553404808044, + "avg_penalty/after_think": 3.8964919447898865, + "avg_penalty/before_target": 0.3244336508214474, + "avg_penalty/before_think": 0.5264299884438515, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 213.25, + "completions/mean_terminated_length": 213.25, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.399, + "grad_norm": 6.604024887084961, + "kl": 15.65625, + "learning_rate": 1.5045276238150194e-05, + "loss": 1.555, + "num_tokens": 26329272.0, + "reward": 1.66015625, + "reward_std": 0.7113470882177353, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3837348371744156, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.3409239649772644, + "step": 798, + "token_counts/after_target": 375.5, + "token_counts/after_think": 157.5, + "token_counts/before_target": 1975.75, + "token_counts/before_think": 903.25 + }, + { + "avg_penalty/after_target": 2.3844815492630005, + "avg_penalty/after_think": 2.8022525906562805, + "avg_penalty/before_target": 0.41133828833699226, + 
"avg_penalty/before_think": 0.5164347663521767, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.5, + "completions/max_terminated_length": 676.5, + "completions/mean_length": 193.328125, + "completions/mean_terminated_length": 193.328125, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.3995, + "grad_norm": 11.834426879882812, + "kl": 17.296875, + "learning_rate": 1.5030199466302354e-05, + "loss": 1.7713, + "num_tokens": 26350781.0, + "reward": 1.671875, + "reward_std": 0.7323294579982758, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38336414843797684, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.3553176745772362, + "step": 799, + "token_counts/after_target": 493.75, + "token_counts/after_think": 73.0, + "token_counts/before_target": 1569.25, + "token_counts/before_think": 957.25 + }, + { + "avg_penalty/after_target": 2.665812313556671, + "avg_penalty/after_think": 2.626570999622345, + "avg_penalty/before_target": 0.3417444974184036, + "avg_penalty/before_think": 0.5073770582675934, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 606.75, + "completions/max_terminated_length": 490.25, + "completions/mean_length": 196.53125, + "completions/mean_terminated_length": 183.8687515258789, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.4, + "grad_norm": 14.017864227294922, + "kl": 13.3466796875, + "learning_rate": 1.5015107371594576e-05, + "loss": 1.5305, + "num_tokens": 26371551.0, + "reward": 1.70703125, + 
"reward_std": 0.591723620891571, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.3133598491549492, + "rewards/tag_count_reward/mean": 0.86328125, + "rewards/tag_count_reward/std": 0.2830768972635269, + "step": 800, + "token_counts/after_target": 515.0, + "token_counts/after_think": 63.25, + "token_counts/before_target": 1553.75, + "token_counts/before_think": 1012.5 + }, + { + "avg_penalty/after_target": 2.161913514137268, + "avg_penalty/after_think": 2.7670310139656067, + "avg_penalty/before_target": 0.27644232660532, + "avg_penalty/before_think": 0.4850270226597786, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 555.5, + "completions/max_terminated_length": 459.5, + "completions/mean_length": 175.9375, + "completions/mean_terminated_length": 163.61666870117188, + "completions/min_length": 58.25, + "completions/min_terminated_length": 58.25, + "epoch": 0.4005, + "grad_norm": 6.202334403991699, + "kl": 13.728515625, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.3269, + "num_tokens": 26395643.0, + "reward": 1.69140625, + "reward_std": 0.6645581424236298, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3683478757739067, + "rewards/tag_count_reward/mean": 0.86328125, + "rewards/tag_count_reward/std": 0.31324903666973114, + "step": 801, + "token_counts/after_target": 364.0, + "token_counts/after_think": 10.75, + "token_counts/before_target": 1488.5, + "token_counts/before_think": 951.75 + }, + { + "avg_penalty/after_target": 1.638723909854889, + "avg_penalty/after_think": 3.728869915008545, + "avg_penalty/before_target": 0.3747611567378044, + "avg_penalty/before_think": 
0.43733739107847214, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 734.75, + "completions/max_terminated_length": 734.75, + "completions/mean_length": 213.03125, + "completions/mean_terminated_length": 213.03125, + "completions/min_length": 62.25, + "completions/min_terminated_length": 62.25, + "epoch": 0.401, + "grad_norm": 10.123318672180176, + "kl": 18.71875, + "learning_rate": 1.4984877397538305e-05, + "loss": 1.8714, + "num_tokens": 26420749.0, + "reward": 1.66015625, + "reward_std": 0.7399290949106216, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.36191706359386444, + "step": 802, + "token_counts/after_target": 508.75, + "token_counts/after_think": 44.0, + "token_counts/before_target": 1906.75, + "token_counts/before_think": 949.0 + }, + { + "avg_penalty/after_target": 1.8255462348461151, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4046919532120228, + "avg_penalty/before_think": 0.6446873247623444, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.75, + "completions/max_terminated_length": 692.75, + "completions/mean_length": 247.828125, + "completions/mean_terminated_length": 247.828125, + "completions/min_length": 56.5, + "completions/min_terminated_length": 56.5, + "epoch": 0.4015, + "grad_norm": 11.967607498168945, + "kl": 9.16015625, + "learning_rate": 1.4969739610275556e-05, + "loss": 1.2176, + "num_tokens": 26445106.0, + "reward": 1.7734375, + "reward_std": 0.5833620727062225, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3265564441680908, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.2594592645764351, + "step": 803, + "token_counts/after_target": 633.0, + "token_counts/after_think": 44.75, + "token_counts/before_target": 1370.5, + "token_counts/before_think": 1917.0 + }, + { + "avg_penalty/after_target": 2.419584184885025, + "avg_penalty/after_think": 3.7888631224632263, + "avg_penalty/before_target": 0.37021297216415405, + "avg_penalty/before_think": 0.5183991566300392, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.5, + "completions/max_terminated_length": 553.5, + "completions/mean_length": 240.8125, + "completions/mean_terminated_length": 240.8125, + "completions/min_length": 41.5, + "completions/min_terminated_length": 41.5, + "epoch": 0.402, + "grad_norm": 2.436133861541748, + "kl": 17.6845703125, + "learning_rate": 1.4954586684324077e-05, + "loss": 1.573, + "num_tokens": 26471510.0, + "reward": 1.56640625, + "reward_std": 0.7714317440986633, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.38778156042099, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.3841073513031006, + "step": 804, + "token_counts/after_target": 656.25, + "token_counts/after_think": 208.75, + "token_counts/before_target": 1809.25, + "token_counts/before_think": 1178.75 + }, + { + "avg_penalty/after_target": 2.275200217962265, + "avg_penalty/after_think": 2.7225432991981506, + "avg_penalty/before_target": 0.34379903972148895, + "avg_penalty/before_think": 0.5897568315267563, + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 170.4375, + "completions/mean_terminated_length": 170.4375, + "completions/min_length": 44.5, + "completions/min_terminated_length": 44.5, + "epoch": 0.4025, + "grad_norm": 3.570645332336426, + "kl": 17.8671875, + "learning_rate": 1.493941866584231e-05, + "loss": 1.6299, + "num_tokens": 26492658.0, + "reward": 1.6875, + "reward_std": 0.715053990483284, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3758598491549492, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.3229247257113457, + "step": 805, + "token_counts/after_target": 473.75, + "token_counts/after_think": 19.5, + "token_counts/before_target": 1235.75, + "token_counts/before_think": 998.0 + }, + { + "avg_penalty/after_target": 2.856193959712982, + "avg_penalty/after_think": 2.5664870142936707, + "avg_penalty/before_target": 0.35401833057403564, + "avg_penalty/before_think": 0.3342319056391716, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.25, + "completions/max_terminated_length": 548.25, + "completions/mean_length": 214.953125, + "completions/mean_terminated_length": 214.953125, + "completions/min_length": 40.5, + "completions/min_terminated_length": 40.5, + "epoch": 0.403, + "grad_norm": 6.066196441650391, + "kl": 24.15625, + "learning_rate": 1.4924235601034673e-05, + "loss": 1.9095, + "num_tokens": 26516639.0, + "reward": 1.49609375, + "reward_std": 0.8542595356702805, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44938503205776215, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.4103276804089546, + "step": 806, + "token_counts/after_target": 585.0, + "token_counts/after_think": 30.75, + "token_counts/before_target": 1773.25, + "token_counts/before_think": 1050.25 + }, + { + "avg_penalty/after_target": 2.093466430902481, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4108763486146927, + "avg_penalty/before_think": 0.4201972335577011, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 212.078125, + "completions/mean_terminated_length": 212.078125, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.4035, + "grad_norm": 5.865439414978027, + "kl": 20.515625, + "learning_rate": 1.490903753615141e-05, + "loss": 1.9696, + "num_tokens": 26540404.0, + "reward": 1.6953125, + "reward_std": 0.6691634953022003, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3758598491549492, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.3012717664241791, + "step": 807, + "token_counts/after_target": 647.75, + "token_counts/after_think": 268.5, + "token_counts/before_target": 1367.25, + "token_counts/before_think": 1109.75 + }, + { + "avg_penalty/after_target": 3.200098395347595, + "avg_penalty/after_think": 3.8447870016098022, + "avg_penalty/before_target": 0.39547089114785194, + "avg_penalty/before_think": 0.556514136493206, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 755.0, + "completions/max_terminated_length": 733.5, + "completions/mean_length": 253.34375, + "completions/mean_terminated_length": 241.7854232788086, + "completions/min_length": 56.5, + "completions/min_terminated_length": 56.5, + "epoch": 0.404, + "grad_norm": 5.662160396575928, + "kl": 25.140625, + "learning_rate": 1.4893824517488464e-05, + "loss": 2.1409, + "num_tokens": 26566026.0, + "reward": 1.5390625, + "reward_std": 0.7593540698289871, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.41419370472431183, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.3679445907473564, + "step": 808, + "token_counts/after_target": 810.75, + "token_counts/after_think": 245.75, + "token_counts/before_target": 2071.0, + "token_counts/before_think": 926.0 + }, + { + "avg_penalty/after_target": 3.0705068111419678, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.2663937956094742, + "avg_penalty/before_think": 0.3110821098089218, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.5, + "completions/max_terminated_length": 445.5, + "completions/mean_length": 156.390625, + "completions/mean_terminated_length": 156.390625, + "completions/min_length": 32.25, + "completions/min_terminated_length": 32.25, + "epoch": 0.4045, + "grad_norm": 6.580674171447754, + "kl": 17.390625, + "learning_rate": 1.4878596591387329e-05, + "loss": 1.375, + "num_tokens": 26587859.0, + "reward": 1.65234375, + "reward_std": 0.6859204471111298, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + 
"rewards/format_reward/std": 0.36180340498685837, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.33116767555475235, + "step": 809, + "token_counts/after_target": 231.5, + "token_counts/after_think": 70.5, + "token_counts/before_target": 1463.0, + "token_counts/before_think": 737.25 + }, + { + "avg_penalty/after_target": 2.8231229186058044, + "avg_penalty/after_think": 3.8508426547050476, + "avg_penalty/before_target": 0.3161960132420063, + "avg_penalty/before_think": 0.4387390539050102, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 184.15625, + "completions/mean_terminated_length": 184.15625, + "completions/min_length": 41.5, + "completions/min_terminated_length": 41.5, + "epoch": 0.405, + "grad_norm": 6.133103370666504, + "kl": 17.3046875, + "learning_rate": 1.4863353804234906e-05, + "loss": 1.4664, + "num_tokens": 26607757.0, + "reward": 1.703125, + "reward_std": 0.6442451477050781, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38336414843797684, + "rewards/tag_count_reward/mean": 0.875, + "rewards/tag_count_reward/std": 0.2864980138838291, + "step": 810, + "token_counts/after_target": 332.0, + "token_counts/after_think": 152.0, + "token_counts/before_target": 1610.75, + "token_counts/before_think": 851.75 + }, + { + "avg_penalty/after_target": 2.0949545800685883, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3947723060846329, + "avg_penalty/before_think": 0.5432385802268982, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 565.25, + "completions/max_terminated_length": 565.25, + "completions/mean_length": 193.828125, + "completions/mean_terminated_length": 193.828125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.4055, + "grad_norm": 16.22422981262207, + "kl": 25.171875, + "learning_rate": 1.4848096202463373e-05, + "loss": 1.7287, + "num_tokens": 26628946.0, + "reward": 1.40234375, + "reward_std": 0.8624787032604218, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.48456869274377823, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.4070384204387665, + "step": 811, + "token_counts/after_target": 529.75, + "token_counts/after_think": 63.0, + "token_counts/before_target": 1583.25, + "token_counts/before_think": 925.25 + }, + { + "avg_penalty/after_target": 3.251150608062744, + "avg_penalty/after_think": 3.552269399166107, + "avg_penalty/before_target": 0.25712624937295914, + "avg_penalty/before_think": 0.41940565407276154, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.75, + "completions/max_terminated_length": 450.75, + "completions/mean_length": 190.34375, + "completions/mean_terminated_length": 190.34375, + "completions/min_length": 57.75, + "completions/min_terminated_length": 57.75, + "epoch": 0.406, + "grad_norm": 7.665066242218018, + "kl": 23.875, + "learning_rate": 1.4832823832550025e-05, + "loss": 1.8932, + "num_tokens": 26649656.0, + "reward": 1.625, + "reward_std": 0.7254670560359955, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + 
"rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.32305993512272835, + "step": 812, + "token_counts/after_target": 347.75, + "token_counts/after_think": 89.0, + "token_counts/before_target": 1635.75, + "token_counts/before_think": 973.0 + }, + { + "avg_penalty/after_target": 1.9745172560214996, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.440015260130167, + "avg_penalty/before_think": 0.6076209619641304, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 248.078125, + "completions/mean_terminated_length": 248.078125, + "completions/min_length": 52.5, + "completions/min_terminated_length": 52.5, + "epoch": 0.4065, + "grad_norm": 8.341379165649414, + "kl": 24.359375, + "learning_rate": 1.4817536741017153e-05, + "loss": 1.9248, + "num_tokens": 26674109.0, + "reward": 1.43359375, + "reward_std": 0.8065145164728165, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.47770625352859497, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.36149657517671585, + "step": 813, + "token_counts/after_target": 607.75, + "token_counts/after_think": 142.75, + "token_counts/before_target": 2035.25, + "token_counts/before_think": 1183.5 + }, + { + "avg_penalty/after_target": 2.373710870742798, + "avg_penalty/after_think": 3.7242671847343445, + "avg_penalty/before_target": 0.4420575611293316, + "avg_penalty/before_think": 0.3110385611653328, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 
715.75, + "completions/max_terminated_length": 690.5, + "completions/mean_length": 218.359375, + "completions/mean_terminated_length": 205.71667098999023, + "completions/min_length": 43.5, + "completions/min_terminated_length": 43.5, + "epoch": 0.407, + "grad_norm": 6.175736427307129, + "kl": 28.0625, + "learning_rate": 1.480223497443189e-05, + "loss": 2.2195, + "num_tokens": 26700548.0, + "reward": 1.375, + "reward_std": 0.8646570295095444, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.479247085750103, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.4135810658335686, + "step": 814, + "token_counts/after_target": 804.5, + "token_counts/after_think": 64.0, + "token_counts/before_target": 1813.0, + "token_counts/before_think": 812.25 + }, + { + "avg_penalty/after_target": 2.3271699249744415, + "avg_penalty/after_think": 3.8538554310798645, + "avg_penalty/before_target": 0.2600298412144184, + "avg_penalty/before_think": 0.3966691121459007, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.25, + "completions/max_terminated_length": 425.25, + "completions/mean_length": 170.25, + "completions/mean_terminated_length": 170.25, + "completions/min_length": 52.25, + "completions/min_terminated_length": 52.25, + "epoch": 0.4075, + "grad_norm": 2.6952874660491943, + "kl": 16.203125, + "learning_rate": 1.478691857940607e-05, + "loss": 1.3618, + "num_tokens": 26722468.0, + "reward": 1.71875, + "reward_std": 0.6470011174678802, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.125, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 0.875, + "rewards/tag_count_reward/std": 
0.26052870601415634, + "step": 815, + "token_counts/after_target": 215.25, + "token_counts/after_think": 130.5, + "token_counts/before_target": 1421.0, + "token_counts/before_think": 957.25 + }, + { + "avg_penalty/after_target": 3.0381964445114136, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.418467678129673, + "avg_penalty/before_think": 0.4024181663990021, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.75, + "completions/max_terminated_length": 460.75, + "completions/mean_length": 196.59375, + "completions/mean_terminated_length": 196.59375, + "completions/min_length": 51.5, + "completions/min_terminated_length": 51.5, + "epoch": 0.408, + "grad_norm": 11.550536155700684, + "kl": 17.53125, + "learning_rate": 1.4771587602596085e-05, + "loss": 1.8333, + "num_tokens": 26744474.0, + "reward": 1.625, + "reward_std": 0.8181928396224976, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4141380712389946, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.3895714581012726, + "step": 816, + "token_counts/after_target": 692.25, + "token_counts/after_think": 63.25, + "token_counts/before_target": 1405.25, + "token_counts/before_think": 984.75 + }, + { + "avg_penalty/after_target": 2.1106087267398834, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3916815370321274, + "avg_penalty/before_think": 0.46996011584997177, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.75, + "completions/max_terminated_length": 558.75, + "completions/mean_length": 193.90625, + 
"completions/mean_terminated_length": 193.90625, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.4085, + "grad_norm": 7.921955108642578, + "kl": 19.203125, + "learning_rate": 1.4756242090702756e-05, + "loss": 1.8332, + "num_tokens": 26766500.0, + "reward": 1.57421875, + "reward_std": 0.807465597987175, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.44721361994743347, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.3657536581158638, + "step": 817, + "token_counts/after_target": 596.75, + "token_counts/after_think": 138.5, + "token_counts/before_target": 1539.25, + "token_counts/before_think": 828.0 + }, + { + "avg_penalty/after_target": 2.440030872821808, + "avg_penalty/after_think": 1.6695131063461304, + "avg_penalty/before_target": 0.5907834246754646, + "avg_penalty/before_think": 0.6034694910049438, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 851.25, + "completions/max_terminated_length": 730.25, + "completions/mean_length": 283.9375, + "completions/mean_terminated_length": 234.97315979003906, + "completions/min_length": 49.5, + "completions/min_terminated_length": 49.5, + "epoch": 0.409, + "grad_norm": 23.442720413208008, + "kl": 18.5625, + "learning_rate": 1.4740882090471163e-05, + "loss": 2.2577, + "num_tokens": 26795056.0, + "reward": 1.63671875, + "reward_std": 0.6934643089771271, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.29290355183184147, + "step": 818, + "token_counts/after_target": 1447.75, 
+ "token_counts/after_think": 24.0, + "token_counts/before_target": 2040.5, + "token_counts/before_think": 1030.75 + }, + { + "avg_penalty/after_target": 2.7654154896736145, + "avg_penalty/after_think": 2.8396854996681213, + "avg_penalty/before_target": 0.4159672372043133, + "avg_penalty/before_think": 0.4854820594191551, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.5, + "completions/max_terminated_length": 602.5, + "completions/mean_length": 191.734375, + "completions/mean_terminated_length": 191.734375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.4095, + "grad_norm": 17.36162567138672, + "kl": 12.140625, + "learning_rate": 1.4725507648690542e-05, + "loss": 1.5729, + "num_tokens": 26818607.0, + "reward": 1.70703125, + "reward_std": 0.7001806348562241, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.3723389655351639, + "rewards/tag_count_reward/mean": 0.86328125, + "rewards/tag_count_reward/std": 0.3397887498140335, + "step": 819, + "token_counts/after_target": 691.75, + "token_counts/after_think": 26.75, + "token_counts/before_target": 1527.75, + "token_counts/before_think": 821.5 + }, + { + "avg_penalty/after_target": 2.4500990509986877, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3615216389298439, + "avg_penalty/before_think": 0.5036794394254684, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.25, + "completions/max_terminated_length": 632.25, + "completions/mean_length": 252.390625, + "completions/mean_terminated_length": 252.390625, + 
"completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.41, + "grad_norm": 4.436160087585449, + "kl": 19.703125, + "learning_rate": 1.47101188121941e-05, + "loss": 1.7173, + "num_tokens": 26845320.0, + "reward": 1.30078125, + "reward_std": 0.8761585503816605, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.49297719448804855, + "rewards/tag_count_reward/mean": 0.70703125, + "rewards/tag_count_reward/std": 0.4206565320491791, + "step": 820, + "token_counts/after_target": 666.25, + "token_counts/after_think": 153.5, + "token_counts/before_target": 2179.25, + "token_counts/before_think": 1039.25 + }, + { + "avg_penalty/after_target": 3.5807193517684937, + "avg_penalty/after_think": 1.9877881407737732, + "avg_penalty/before_target": 0.3401258811354637, + "avg_penalty/before_think": 0.31723441556096077, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.25, + "completions/max_terminated_length": 545.25, + "completions/mean_length": 191.8125, + "completions/mean_terminated_length": 191.8125, + "completions/min_length": 46.25, + "completions/min_terminated_length": 46.25, + "epoch": 0.4105, + "grad_norm": 11.097280502319336, + "kl": 18.2421875, + "learning_rate": 1.469471562785891e-05, + "loss": 1.8193, + "num_tokens": 26865644.0, + "reward": 1.5, + "reward_std": 0.8106341063976288, + "rewards/accuracy_reward/mean": null, + "rewards/accuracy_reward/std": null, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.3632853329181671, + "step": 821, + "token_counts/after_target": 597.75, + "token_counts/after_think": 77.5, + "token_counts/before_target":
1350.5, + "token_counts/before_think": 1043.25 + }, + { + "avg_penalty/after_target": 2.518712729215622, + "avg_penalty/after_think": 2.680673658847809, + "avg_penalty/before_target": 0.3227451629936695, + "avg_penalty/before_think": 0.37607891112565994, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.75, + "completions/max_terminated_length": 531.75, + "completions/mean_length": 189.921875, + "completions/mean_terminated_length": 189.921875, + "completions/min_length": 38.5, + "completions/min_terminated_length": 38.5, + "epoch": 0.411, + "grad_norm": 2.6372718811035156, + "kl": 20.3203125, + "learning_rate": 1.4679298142605735e-05, + "loss": 1.6661, + "num_tokens": 26891863.0, + "reward": 1.5625, + "reward_std": 0.8956190347671509, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.14789126068353653, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4308478757739067, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.4136316403746605, + "step": 822, + "token_counts/after_target": 615.25, + "token_counts/after_think": 18.0, + "token_counts/before_target": 1802.5, + "token_counts/before_think": 603.0 + }, + { + "avg_penalty/after_target": 2.713769882917404, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3750930577516556, + "avg_penalty/before_think": 0.5087263397872448, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.25, + "completions/max_terminated_length": 675.25, + "completions/mean_length": 242.9375, + "completions/mean_terminated_length": 242.9375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 
38.0, + "epoch": 0.4115, + "grad_norm": 4.73141622543335, + "kl": 27.125, + "learning_rate": 1.4663866403398915e-05, + "loss": 2.227, + "num_tokens": 26918931.0, + "reward": 1.3984375, + "reward_std": 0.8549593985080719, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.44938503205776215, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.4166818633675575, + "step": 823, + "token_counts/after_target": 953.0, + "token_counts/after_think": 31.75, + "token_counts/before_target": 2187.75, + "token_counts/before_think": 714.5 + }, + { + "avg_penalty/after_target": 2.7278584837913513, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.41489141806960106, + "avg_penalty/before_think": 0.4676142632961273, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.75, + "completions/max_terminated_length": 557.75, + "completions/mean_length": 194.328125, + "completions/mean_terminated_length": 194.328125, + "completions/min_length": 52.25, + "completions/min_terminated_length": 52.25, + "epoch": 0.412, + "grad_norm": 4.223733901977539, + "kl": 24.390625, + "learning_rate": 1.46484204572462e-05, + "loss": 2.0938, + "num_tokens": 26942296.0, + "reward": 1.5703125, + "reward_std": 0.8002886474132538, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4057852029800415, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.3952583745121956, + "step": 824, + "token_counts/after_target": 679.5, + "token_counts/after_think": 39.25, + "token_counts/before_target": 1766.5, + "token_counts/before_think": 624.0 + }, + { + "avg_penalty/after_target": 
1.7646002769470215, + "avg_penalty/after_think": 2.7759143114089966, + "avg_penalty/before_target": 0.3081749901175499, + "avg_penalty/before_think": 0.5627307742834091, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.25, + "completions/max_terminated_length": 564.25, + "completions/mean_length": 196.953125, + "completions/mean_terminated_length": 196.953125, + "completions/min_length": 42.75, + "completions/min_terminated_length": 42.75, + "epoch": 0.4125, + "grad_norm": 14.854974746704102, + "kl": 31.0625, + "learning_rate": 1.463296035119862e-05, + "loss": 2.129, + "num_tokens": 26966773.0, + "reward": 1.32421875, + "reward_std": 0.9465876966714859, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.47360680997371674, + "rewards/tag_count_reward/mean": 0.66796875, + "rewards/tag_count_reward/std": 0.4511307030916214, + "step": 825, + "token_counts/after_target": 447.0, + "token_counts/after_think": 9.25, + "token_counts/before_target": 2263.5, + "token_counts/before_think": 431.5 + }, + { + "avg_penalty/after_target": 2.610219419002533, + "avg_penalty/after_think": 1.7752593755722046, + "avg_penalty/before_target": 0.2784786708652973, + "avg_penalty/before_think": 0.39099662005901337, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.5, + "completions/max_terminated_length": 467.5, + "completions/mean_length": 152.40625, + "completions/mean_terminated_length": 152.40625, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.413, + "grad_norm": 3.1476547718048096, + "kl": 
20.46875, + "learning_rate": 1.4617486132350343e-05, + "loss": 1.7942, + "num_tokens": 26986879.0, + "reward": 1.60546875, + "reward_std": 0.7605211436748505, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.41194770485162735, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.36867261677980423, + "step": 826, + "token_counts/after_target": 228.0, + "token_counts/after_think": 37.25, + "token_counts/before_target": 1529.25, + "token_counts/before_think": 644.0 + }, + { + "avg_penalty/after_target": 2.667556405067444, + "avg_penalty/after_think": 2.3988423347473145, + "avg_penalty/before_target": 0.37658772245049477, + "avg_penalty/before_think": 0.4180372506380081, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.5, + "completions/max_terminated_length": 724.5, + "completions/mean_length": 247.53125, + "completions/mean_terminated_length": 247.53125, + "completions/min_length": 55.75, + "completions/min_terminated_length": 55.75, + "epoch": 0.4135, + "grad_norm": 3.529451370239258, + "kl": 27.0625, + "learning_rate": 1.4601997847838518e-05, + "loss": 2.2439, + "num_tokens": 27013393.0, + "reward": 1.328125, + "reward_std": 0.926904171705246, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.48989029973745346, + "rewards/tag_count_reward/mean": 0.6875, + "rewards/tag_count_reward/std": 0.45030052214860916, + "step": 827, + "token_counts/after_target": 966.75, + "token_counts/after_think": 22.25, + "token_counts/before_target": 2229.5, + "token_counts/before_think": 742.0 + }, + { + "avg_penalty/after_target": 2.384581059217453, + "avg_penalty/after_think": 3.0, + 
"avg_penalty/before_target": 0.38110995292663574, + "avg_penalty/before_think": 0.5313112065196037, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 222.71875, + "completions/mean_terminated_length": 222.71875, + "completions/min_length": 39.25, + "completions/min_terminated_length": 39.25, + "epoch": 0.414, + "grad_norm": 4.959833145141602, + "kl": 26.40625, + "learning_rate": 1.4586495544843153e-05, + "loss": 2.2943, + "num_tokens": 27036191.0, + "reward": 1.4140625, + "reward_std": 0.8848461508750916, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.46450965851545334, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.42931779474020004, + "step": 828, + "token_counts/after_target": 920.25, + "token_counts/after_think": 26.25, + "token_counts/before_target": 1787.25, + "token_counts/before_think": 829.75 + }, + { + "avg_penalty/after_target": 2.8323104977607727, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.2874293066561222, + "avg_penalty/before_think": 0.34695763140916824, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.5, + "completions/max_terminated_length": 488.5, + "completions/mean_length": 160.59375, + "completions/mean_terminated_length": 160.59375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.4145, + "grad_norm": 3.833737850189209, + "kl": 22.15625, + "learning_rate": 1.4570979270586944e-05, + "loss": 1.8243, + "num_tokens": 27055061.0, + 
"reward": 1.46875, + "reward_std": 0.861626535654068, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.44974804669618607, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.4177538827061653, + "step": 829, + "token_counts/after_target": 395.75, + "token_counts/after_think": 50.25, + "token_counts/before_target": 1653.0, + "token_counts/before_think": 470.5 + }, + { + "avg_penalty/after_target": 3.081943690776825, + "avg_penalty/after_think": 2.7606366872787476, + "avg_penalty/before_target": 0.26776332780718803, + "avg_penalty/before_think": 0.3779636360704899, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.25, + "completions/max_terminated_length": 504.25, + "completions/mean_length": 157.1875, + "completions/mean_terminated_length": 157.1875, + "completions/min_length": 30.25, + "completions/min_terminated_length": 30.25, + "epoch": 0.415, + "grad_norm": 7.821195125579834, + "kl": 19.03125, + "learning_rate": 1.4555449072335157e-05, + "loss": 1.7506, + "num_tokens": 27076593.0, + "reward": 1.609375, + "reward_std": 0.7995077967643738, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4141380712389946, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.3911924958229065, + "step": 830, + "token_counts/after_target": 372.5, + "token_counts/after_think": 13.0, + "token_counts/before_target": 1589.25, + "token_counts/before_think": 540.25 + }, + { + "avg_penalty/after_target": 2.456258535385132, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.31676318496465683, + "avg_penalty/before_think": 0.40471697598695755, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.5, + "completions/max_terminated_length": 490.5, + "completions/mean_length": 160.65625, + "completions/mean_terminated_length": 160.65625, + "completions/min_length": 48.25, + "completions/min_terminated_length": 48.25, + "epoch": 0.4155, + "grad_norm": 3.8260488510131836, + "kl": 21.6640625, + "learning_rate": 1.4539904997395468e-05, + "loss": 1.9799, + "num_tokens": 27097547.0, + "reward": 1.6640625, + "reward_std": 0.7208337336778641, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.37149807065725327, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.34981294721364975, + "step": 831, + "token_counts/after_target": 436.75, + "token_counts/after_think": 21.0, + "token_counts/before_target": 1675.0, + "token_counts/before_think": 437.75 + }, + { + "avg_penalty/after_target": 2.4873537719249725, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.29865530878305435, + "avg_penalty/before_think": 0.4682833030819893, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.75, + "completions/max_terminated_length": 421.75, + "completions/mean_length": 138.6875, + "completions/mean_terminated_length": 138.6875, + "completions/min_length": 32.25, + "completions/min_terminated_length": 32.25, + "epoch": 0.416, + "grad_norm": 3.7082772254943848, + "kl": 16.15625, + "learning_rate": 1.4524347093117828e-05, + "loss": 1.3606, + "num_tokens": 27113415.0, + "reward": 1.6328125, + "reward_std": 0.7283057570457458, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4022643193602562, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.3343529924750328, + "step": 832, + "token_counts/after_target": 295.0, + "token_counts/after_think": 25.5, + "token_counts/before_target": 1250.5, + "token_counts/before_think": 648.0 + }, + { + "avg_penalty/after_target": 2.6400148570537567, + "avg_penalty/after_think": 3.8608400225639343, + "avg_penalty/before_target": 0.40875231102108955, + "avg_penalty/before_think": 0.43590372055768967, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 163.671875, + "completions/mean_terminated_length": 163.671875, + "completions/min_length": 46.75, + "completions/min_terminated_length": 46.75, + "epoch": 0.4165, + "grad_norm": 10.322114944458008, + "kl": 20.125, + "learning_rate": 1.4508775406894308e-05, + "loss": 2.0794, + "num_tokens": 27134114.0, + "reward": 1.734375, + "reward_std": 0.6672220081090927, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.34944770485162735, + "rewards/tag_count_reward/mean": 0.875, + "rewards/tag_count_reward/std": 0.3201601654291153, + "step": 833, + "token_counts/after_target": 501.5, + "token_counts/after_think": 22.75, + "token_counts/before_target": 1397.5, + "token_counts/before_think": 697.0 + }, + { + "avg_penalty/after_target": 2.0617874562740326, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4591885656118393, + "avg_penalty/before_think": 0.49670955538749695, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.25, + "completions/max_terminated_length": 673.25, + "completions/mean_length": 185.390625, + "completions/mean_terminated_length": 185.390625, + "completions/min_length": 66.5, + "completions/min_terminated_length": 66.5, + "epoch": 0.417, + "grad_norm": 7.31519079208374, + "kl": 25.28125, + "learning_rate": 1.4493189986158966e-05, + "loss": 2.1951, + "num_tokens": 27155787.0, + "reward": 1.6328125, + "reward_std": 0.785381942987442, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.3886903375387192, + "step": 834, + "token_counts/after_target": 571.25, + "token_counts/after_think": 56.0, + "token_counts/before_target": 1738.5, + "token_counts/before_think": 600.5 + }, + { + "avg_penalty/after_target": 2.7503997683525085, + "avg_penalty/after_think": 2.8702616691589355, + "avg_penalty/before_target": 0.2403367944061756, + "avg_penalty/before_think": 0.390348844230175, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.75, + "completions/max_terminated_length": 372.75, + "completions/mean_length": 143.46875, + "completions/mean_terminated_length": 143.46875, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.4175, + "grad_norm": 10.930242538452148, + "kl": 25.34375, + "learning_rate": 1.4477590878387697e-05, + "loss": 1.8689, + "num_tokens": 27177961.0, + "reward": 1.5078125, + "reward_std": 0.8222357779741287, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + 
"rewards/format_reward/std": 0.43708496540784836, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.3963882699608803, + "step": 835, + "token_counts/after_target": 286.0, + "token_counts/after_think": 37.25, + "token_counts/before_target": 1405.25, + "token_counts/before_think": 567.0 + }, + { + "avg_penalty/after_target": 3.0735373497009277, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.2280096746981144, + "avg_penalty/before_think": 0.3651208207011223, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 152.0625, + "completions/mean_terminated_length": 152.0625, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.418, + "grad_norm": 7.44320011138916, + "kl": 25.78125, + "learning_rate": 1.4461978131098089e-05, + "loss": 1.9902, + "num_tokens": 27198157.0, + "reward": 1.68359375, + "reward_std": 0.7775082141160965, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.3533445745706558, + "step": 836, + "token_counts/after_target": 162.0, + "token_counts/after_think": 21.75, + "token_counts/before_target": 1782.5, + "token_counts/before_think": 466.75 + }, + { + "avg_penalty/after_target": 2.0779473185539246, + "avg_penalty/after_think": 3.984665095806122, + "avg_penalty/before_target": 0.4652332477271557, + "avg_penalty/before_think": 0.5223127380013466, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 553.75, + "completions/max_terminated_length": 553.75, + "completions/mean_length": 162.828125, + "completions/mean_terminated_length": 162.828125, + "completions/min_length": 51.25, + "completions/min_terminated_length": 51.25, + "epoch": 0.4185, + "grad_norm": 5.734861373901367, + "kl": 23.1806640625, + "learning_rate": 1.4446351791849276e-05, + "loss": 2.0107, + "num_tokens": 27218514.0, + "reward": 1.55859375, + "reward_std": 0.7442358583211899, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4079566150903702, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.3421566039323807, + "step": 837, + "token_counts/after_target": 532.5, + "token_counts/after_think": 67.25, + "token_counts/before_target": 1396.5, + "token_counts/before_think": 609.0 + }, + { + "avg_penalty/after_target": 2.7142900228500366, + "avg_penalty/after_think": 2.97587788105011, + "avg_penalty/before_target": 0.44083530083298683, + "avg_penalty/before_think": 0.635736271739006, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 196.1875, + "completions/mean_terminated_length": 196.1875, + "completions/min_length": 58.25, + "completions/min_terminated_length": 58.25, + "epoch": 0.419, + "grad_norm": 8.74035358428955, + "kl": 19.73046875, + "learning_rate": 1.4430711908241798e-05, + "loss": 2.0486, + "num_tokens": 27239262.0, + "reward": 1.68359375, + "reward_std": 0.6738958060741425, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.35648179799318314, + 
"rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.32428254932165146, + "step": 838, + "token_counts/after_target": 748.75, + "token_counts/after_think": 92.0, + "token_counts/before_target": 1533.5, + "token_counts/before_think": 764.75 + }, + { + "avg_penalty/after_target": 2.360717475414276, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4285440817475319, + "avg_penalty/before_think": 0.6701878383755684, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 728.25, + "completions/max_terminated_length": 638.25, + "completions/mean_length": 256.4375, + "completions/mean_terminated_length": 243.1958351135254, + "completions/min_length": 55.25, + "completions/min_terminated_length": 55.25, + "epoch": 0.4195, + "grad_norm": 36.279457092285156, + "kl": 21.84375, + "learning_rate": 1.4415058527917454e-05, + "loss": 1.8163, + "num_tokens": 27266538.0, + "reward": 1.44921875, + "reward_std": 0.6746709495782852, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.487064003944397, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.2605947870761156, + "step": 839, + "token_counts/after_target": 1112.0, + "token_counts/after_think": 148.75, + "token_counts/before_target": 1956.5, + "token_counts/before_think": 885.75 + }, + { + "avg_penalty/after_target": 2.618542969226837, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3783564567565918, + "avg_penalty/before_think": 0.3441651836037636, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.75, + 
"completions/max_terminated_length": 581.75, + "completions/mean_length": 159.1875, + "completions/mean_terminated_length": 159.1875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.42, + "grad_norm": 5.2610273361206055, + "kl": 23.09375, + "learning_rate": 1.4399391698559153e-05, + "loss": 2.0674, + "num_tokens": 27285686.0, + "reward": 1.55859375, + "reward_std": 0.7803390771150589, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4260597825050354, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.3754643425345421, + "step": 840, + "token_counts/after_target": 463.75, + "token_counts/after_think": 22.25, + "token_counts/before_target": 1563.75, + "token_counts/before_think": 497.25 + }, + { + "avg_penalty/after_target": 2.720553606748581, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.4331942982971668, + "avg_penalty/before_think": 0.49258212745189667, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.5, + "completions/max_terminated_length": 496.5, + "completions/mean_length": 186.09375, + "completions/mean_terminated_length": 186.09375, + "completions/min_length": 60.75, + "completions/min_terminated_length": 60.75, + "epoch": 0.4205, + "grad_norm": 5.599667072296143, + "kl": 15.650390625, + "learning_rate": 1.4383711467890776e-05, + "loss": 1.5813, + "num_tokens": 27306588.0, + "reward": 1.73828125, + "reward_std": 0.5626897066831589, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.2979728877544403, + "rewards/tag_count_reward/mean": 0.87890625, + "rewards/tag_count_reward/std": 0.2685556411743164, 
+ "step": 841, + "token_counts/after_target": 621.25, + "token_counts/after_think": 64.5, + "token_counts/before_target": 1547.5, + "token_counts/before_think": 744.25 + }, + { + "avg_penalty/after_target": 2.568641811609268, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3726687803864479, + "avg_penalty/before_think": 0.5721564441919327, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.75, + "completions/max_terminated_length": 646.75, + "completions/mean_length": 205.296875, + "completions/mean_terminated_length": 205.296875, + "completions/min_length": 63.5, + "completions/min_terminated_length": 63.5, + "epoch": 0.421, + "grad_norm": 5.4489264488220215, + "kl": 20.03125, + "learning_rate": 1.4368017883677024e-05, + "loss": 1.9259, + "num_tokens": 27332111.0, + "reward": 1.6484375, + "reward_std": 0.6875103712081909, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.37675637751817703, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.32536301016807556, + "step": 842, + "token_counts/after_target": 706.25, + "token_counts/after_think": 111.5, + "token_counts/before_target": 1470.75, + "token_counts/before_think": 996.25 + }, + { + "avg_penalty/after_target": 2.491391807794571, + "avg_penalty/after_think": 3.9534042477607727, + "avg_penalty/before_target": 0.33564263954758644, + "avg_penalty/before_think": 0.42989619821310043, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.25, + "completions/max_terminated_length": 500.25, + "completions/mean_length": 134.515625, + 
"completions/mean_terminated_length": 134.515625, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.4215, + "grad_norm": 4.348962306976318, + "kl": 19.21875, + "learning_rate": 1.4352310993723277e-05, + "loss": 1.8235, + "num_tokens": 27348816.0, + "reward": 1.8359375, + "reward_std": 0.5894620418548584, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29578252136707306, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.2761673331260681, + "step": 843, + "token_counts/after_target": 295.0, + "token_counts/after_think": 32.5, + "token_counts/before_target": 1316.0, + "token_counts/before_think": 508.75 + }, + { + "avg_penalty/after_target": 2.1715813279151917, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3323713131248951, + "avg_penalty/before_think": 0.49068666249513626, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 165.15625, + "completions/mean_terminated_length": 165.15625, + "completions/min_length": 58.5, + "completions/min_terminated_length": 58.5, + "epoch": 0.422, + "grad_norm": 9.219003677368164, + "kl": 21.453125, + "learning_rate": 1.4336590845875446e-05, + "loss": 1.6974, + "num_tokens": 27369002.0, + "reward": 1.73046875, + "reward_std": 0.6550936847925186, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3375816270709038, + "rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.32005760073661804, + "step": 844, + "token_counts/after_target": 299.5, + 
"token_counts/after_think": 19.0, + "token_counts/before_target": 1486.25, + "token_counts/before_think": 837.75 + }, + { + "avg_penalty/after_target": 2.889122873544693, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.2827858179807663, + "avg_penalty/before_think": 0.3674800246953964, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.75, + "completions/max_terminated_length": 381.75, + "completions/mean_length": 145.234375, + "completions/mean_terminated_length": 145.234375, + "completions/min_length": 50.25, + "completions/min_terminated_length": 50.25, + "epoch": 0.4225, + "grad_norm": 7.093603610992432, + "kl": 18.5625, + "learning_rate": 1.4320857488019826e-05, + "loss": 1.5588, + "num_tokens": 27392777.0, + "reward": 1.7265625, + "reward_std": 0.6247667074203491, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.3454566150903702, + "rewards/tag_count_reward/mean": 0.8828125, + "rewards/tag_count_reward/std": 0.2954637035727501, + "step": 845, + "token_counts/after_target": 298.25, + "token_counts/after_think": 33.25, + "token_counts/before_target": 1427.0, + "token_counts/before_think": 565.25 + }, + { + "avg_penalty/after_target": 2.9919019639492035, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.4092250019311905, + "avg_penalty/before_think": 0.4807750955224037, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 713.25, + "completions/max_terminated_length": 648.25, + "completions/mean_length": 225.6875, + "completions/mean_terminated_length": 212.98020935058594, + 
"completions/min_length": 47.5, + "completions/min_terminated_length": 47.5, + "epoch": 0.423, + "grad_norm": 12.230194091796875, + "kl": 28.90625, + "learning_rate": 1.4305110968082953e-05, + "loss": 2.1579, + "num_tokens": 27419157.0, + "reward": 1.5625, + "reward_std": 0.789889395236969, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4255262687802315, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.3820813000202179, + "step": 846, + "token_counts/after_target": 797.25, + "token_counts/after_think": 24.25, + "token_counts/before_target": 1689.5, + "token_counts/before_think": 1100.0 + }, + { + "avg_penalty/after_target": 2.948127806186676, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.38584132120013237, + "avg_penalty/before_think": 0.5684700980782509, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.25, + "completions/max_terminated_length": 632.25, + "completions/mean_length": 188.65625, + "completions/mean_terminated_length": 188.65625, + "completions/min_length": 54.75, + "completions/min_terminated_length": 54.75, + "epoch": 0.4235, + "grad_norm": 11.01313591003418, + "kl": 27.9375, + "learning_rate": 1.4289351334031461e-05, + "loss": 2.1489, + "num_tokens": 27440191.0, + "reward": 1.5078125, + "reward_std": 0.813549816608429, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.38794170320034027, + "step": 847, + "token_counts/after_target": 697.25, + "token_counts/after_think": 79.25, + "token_counts/before_target": 1578.25, + 
"token_counts/before_think": 663.75 + }, + { + "avg_penalty/after_target": 3.356153577566147, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.34439991787075996, + "avg_penalty/before_think": 0.45483873039484024, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.25, + "completions/max_terminated_length": 539.25, + "completions/mean_length": 195.375, + "completions/mean_terminated_length": 195.375, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.424, + "grad_norm": 6.809028625488281, + "kl": 25.65625, + "learning_rate": 1.4273578633871927e-05, + "loss": 2.0116, + "num_tokens": 27461399.0, + "reward": 1.54296875, + "reward_std": 0.8134571611881256, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4383598491549492, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.38929233700037, + "step": 848, + "token_counts/after_target": 538.25, + "token_counts/after_think": 151.25, + "token_counts/before_target": 1590.75, + "token_counts/before_think": 845.75 + }, + { + "avg_penalty/after_target": 1.8188077509403229, + "avg_penalty/after_think": 2.8071754574775696, + "avg_penalty/before_target": 0.3210282251238823, + "avg_penalty/before_think": 0.47120480984449387, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.75, + "completions/max_terminated_length": 412.75, + "completions/mean_length": 157.890625, + "completions/mean_terminated_length": 157.890625, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.4245, + 
"grad_norm": 9.436656951904297, + "kl": 15.9375, + "learning_rate": 1.4257792915650728e-05, + "loss": 1.0783, + "num_tokens": 27483104.0, + "reward": 1.61328125, + "reward_std": 0.7308799996972084, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4101393073797226, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.3389108404517174, + "step": 849, + "token_counts/after_target": 232.0, + "token_counts/after_think": 11.5, + "token_counts/before_target": 1404.25, + "token_counts/before_think": 878.5 + }, + { + "avg_penalty/after_target": 2.294233590364456, + "avg_penalty/after_think": 1.919463038444519, + "avg_penalty/before_target": 0.40008414536714554, + "avg_penalty/before_think": 0.5592648535966873, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 699.25, + "completions/max_terminated_length": 523.75, + "completions/mean_length": 187.453125, + "completions/mean_terminated_length": 173.85937881469727, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.425, + "grad_norm": 6.332860946655273, + "kl": 12.875, + "learning_rate": 1.4241994227453902e-05, + "loss": 1.2706, + "num_tokens": 27505085.0, + "reward": 1.68359375, + "reward_std": 0.6967508792877197, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.40311288833618164, + "rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.3164079412817955, + "step": 850, + "token_counts/after_target": 570.75, + "token_counts/after_think": 13.5, + "token_counts/before_target": 1376.25, + "token_counts/before_think": 1038.75 + }, + { + "avg_penalty/after_target": 
2.4318183064460754, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.47746072337031364, + "avg_penalty/before_think": 0.5776103436946869, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 625.5, + "completions/max_terminated_length": 533.5, + "completions/mean_length": 226.4375, + "completions/mean_terminated_length": 215.20313262939453, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.4255, + "grad_norm": 4.0711188316345215, + "kl": 24.265625, + "learning_rate": 1.4226182617406996e-05, + "loss": 1.9803, + "num_tokens": 27528201.0, + "reward": 1.3984375, + "reward_std": 0.8399910181760788, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.43835218250751495, + "rewards/tag_count_reward/mean": 0.7109375, + "rewards/tag_count_reward/std": 0.40964654088020325, + "step": 851, + "token_counts/after_target": 850.5, + "token_counts/after_think": 74.5, + "token_counts/before_target": 1803.5, + "token_counts/before_think": 894.5 + }, + { + "avg_penalty/after_target": 2.5786560475826263, + "avg_penalty/after_think": 1.8929929733276367, + "avg_penalty/before_target": 0.2990306504070759, + "avg_penalty/before_think": 0.40217893570661545, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.75, + "completions/max_terminated_length": 553.75, + "completions/mean_length": 186.78125, + "completions/mean_terminated_length": 186.78125, + "completions/min_length": 58.25, + "completions/min_terminated_length": 58.25, + "epoch": 0.426, + "grad_norm": 4.865105152130127, + "kl": 13.65625, + 
"learning_rate": 1.4210358133674912e-05, + "loss": 1.2752, + "num_tokens": 27550875.0, + "reward": 1.703125, + "reward_std": 0.6798001080751419, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38336414843797684, + "rewards/tag_count_reward/mean": 0.875, + "rewards/tag_count_reward/std": 0.31708232313394547, + "step": 852, + "token_counts/after_target": 393.75, + "token_counts/after_think": 58.5, + "token_counts/before_target": 1470.25, + "token_counts/before_think": 1066.0 + }, + { + "avg_penalty/after_target": 2.0732327699661255, + "avg_penalty/after_think": 3.985112488269806, + "avg_penalty/before_target": 0.4462053030729294, + "avg_penalty/before_think": 0.5833172723650932, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.75, + "completions/max_terminated_length": 625.75, + "completions/mean_length": 244.453125, + "completions/mean_terminated_length": 244.453125, + "completions/min_length": 89.5, + "completions/min_terminated_length": 89.5, + "epoch": 0.4265, + "grad_norm": 4.181101322174072, + "kl": 14.4296875, + "learning_rate": 1.4194520824461773e-05, + "loss": 1.3693, + "num_tokens": 27576440.0, + "reward": 1.64453125, + "reward_std": 0.6394963711500168, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3354102149605751, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.30824608355760574, + "step": 853, + "token_counts/after_target": 653.75, + "token_counts/after_think": 111.25, + "token_counts/before_target": 1829.75, + "token_counts/before_think": 1316.5 + }, + { + "avg_penalty/after_target": 2.138934314250946, + "avg_penalty/after_think": 4.0, + 
"avg_penalty/before_target": 0.4362773969769478, + "avg_penalty/before_think": 0.6390921622514725, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.25, + "completions/max_terminated_length": 513.25, + "completions/mean_length": 234.765625, + "completions/mean_terminated_length": 234.765625, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.427, + "grad_norm": 3.061905860900879, + "kl": 16.890625, + "learning_rate": 1.4178670738010769e-05, + "loss": 1.484, + "num_tokens": 27600249.0, + "reward": 1.45703125, + "reward_std": 0.8353397697210312, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4503342807292938, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.40160205215215683, + "step": 854, + "token_counts/after_target": 576.25, + "token_counts/after_think": 253.25, + "token_counts/before_target": 1929.0, + "token_counts/before_think": 997.75 + }, + { + "avg_penalty/after_target": 3.1450315713882446, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.37216563522815704, + "avg_penalty/before_think": 0.4190458804368973, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.5, + "completions/max_terminated_length": 440.5, + "completions/mean_length": 198.703125, + "completions/mean_terminated_length": 198.703125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.4275, + "grad_norm": 5.7467427253723145, + "kl": 17.3125, + "learning_rate": 1.4162807922604014e-05, + "loss": 1.6129, + "num_tokens": 
27622742.0, + "reward": 1.5859375, + "reward_std": 0.7649120837450027, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.41194770485162735, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3639799952507019, + "step": 855, + "token_counts/after_target": 652.75, + "token_counts/after_think": 18.75, + "token_counts/before_target": 1784.75, + "token_counts/before_think": 723.0 + }, + { + "avg_penalty/after_target": 2.541220396757126, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3484320640563965, + "avg_penalty/before_think": 0.45338844507932663, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 224.484375, + "completions/mean_terminated_length": 224.484375, + "completions/min_length": 55.75, + "completions/min_terminated_length": 55.75, + "epoch": 0.428, + "grad_norm": 5.953955173492432, + "kl": 16.734375, + "learning_rate": 1.4146932426562391e-05, + "loss": 1.2874, + "num_tokens": 27647157.0, + "reward": 1.36328125, + "reward_std": 0.8667005598545074, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4776429533958435, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.4178609177470207, + "step": 856, + "token_counts/after_target": 477.25, + "token_counts/after_think": 15.25, + "token_counts/before_target": 2061.75, + "token_counts/before_think": 1037.5 + }, + { + "avg_penalty/after_target": 2.4926755130290985, + "avg_penalty/after_think": 3.8231378197669983, + "avg_penalty/before_target": 0.3280954509973526, + "avg_penalty/before_think": 
0.481276772916317, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 205.4375, + "completions/mean_terminated_length": 205.4375, + "completions/min_length": 46.25, + "completions/min_terminated_length": 46.25, + "epoch": 0.4285, + "grad_norm": 4.615208148956299, + "kl": 16.34375, + "learning_rate": 1.413104429824542e-05, + "loss": 1.3297, + "num_tokens": 27670881.0, + "reward": 1.375, + "reward_std": 0.8288979679346085, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.4761601909995079, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.39436208456754684, + "step": 857, + "token_counts/after_target": 434.0, + "token_counts/after_think": 26.5, + "token_counts/before_target": 2042.5, + "token_counts/before_think": 784.0 + }, + { + "avg_penalty/after_target": 2.6883760690689087, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4488324299454689, + "avg_penalty/before_think": 0.6287405341863632, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.75, + "completions/max_terminated_length": 663.75, + "completions/mean_length": 269.65625, + "completions/mean_terminated_length": 269.65625, + "completions/min_length": 66.5, + "completions/min_terminated_length": 66.5, + "epoch": 0.429, + "grad_norm": 10.068245887756348, + "kl": 11.5, + "learning_rate": 1.411514358605109e-05, + "loss": 1.3503, + "num_tokens": 27701643.0, + "reward": 1.62109375, + "reward_std": 0.7559653073549271, + "rewards/accuracy_reward/mean": 
0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4022643193602562, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.3702537342905998, + "step": 858, + "token_counts/after_target": 1073.0, + "token_counts/after_think": 90.75, + "token_counts/before_target": 1929.5, + "token_counts/before_think": 1221.25 + }, + { + "avg_penalty/after_target": 2.5140041410923004, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4762294292449951, + "avg_penalty/before_think": 0.6452196165919304, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 817.25, + "completions/max_terminated_length": 638.75, + "completions/mean_length": 290.859375, + "completions/mean_terminated_length": 257.2959899902344, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.4295, + "grad_norm": 4.584686756134033, + "kl": 24.78125, + "learning_rate": 1.4099230338415728e-05, + "loss": 2.0496, + "num_tokens": 27736370.0, + "reward": 1.16796875, + "reward_std": 0.8882443010807037, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.48148179799318314, + "rewards/tag_count_reward/mean": 0.62109375, + "rewards/tag_count_reward/std": 0.4338522255420685, + "step": 859, + "token_counts/after_target": 1278.0, + "token_counts/after_think": 56.0, + "token_counts/before_target": 2450.0, + "token_counts/before_think": 869.75 + }, + { + "avg_penalty/after_target": 2.323924571275711, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.405794158577919, + "avg_penalty/before_think": 0.48834675550460815, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 
0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 646.75, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 287.609375, + "completions/mean_terminated_length": 276.4572982788086, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.43, + "grad_norm": 8.52755355834961, + "kl": 16.9375, + "learning_rate": 1.408330460381385e-05, + "loss": 1.2483, + "num_tokens": 27764777.0, + "reward": 1.359375, + "reward_std": 0.8822936713695526, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.4955305755138397, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.42393381893634796, + "step": 860, + "token_counts/after_target": 691.5, + "token_counts/after_think": 146.5, + "token_counts/before_target": 2487.25, + "token_counts/before_think": 1276.5 + }, + { + "avg_penalty/after_target": 2.6097516119480133, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.48502472043037415, + "avg_penalty/before_think": 0.4639838859438896, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.75, + "completions/max_terminated_length": 763.75, + "completions/mean_length": 278.4375, + "completions/mean_terminated_length": 278.4375, + "completions/min_length": 45.75, + "completions/min_terminated_length": 45.75, + "epoch": 0.4305, + "grad_norm": 4.13874626159668, + "kl": 15.609375, + "learning_rate": 1.4067366430758004e-05, + "loss": 1.4874, + "num_tokens": 27791733.0, + "reward": 1.25390625, + "reward_std": 0.8876608163118362, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.5625, + 
"rewards/format_reward/std": 0.4909028485417366, + "rewards/tag_count_reward/mean": 0.69140625, + "rewards/tag_count_reward/std": 0.44298309832811356, + "step": 861, + "token_counts/after_target": 1140.25, + "token_counts/after_think": 26.75, + "token_counts/before_target": 2267.0, + "token_counts/before_think": 1021.0 + }, + { + "avg_penalty/after_target": 2.709449976682663, + "avg_penalty/after_think": 2.8112685680389404, + "avg_penalty/before_target": 0.44344430416822433, + "avg_penalty/before_think": 0.604419119656086, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 285.015625, + "completions/mean_terminated_length": 285.015625, + "completions/min_length": 66.25, + "completions/min_terminated_length": 66.25, + "epoch": 0.431, + "grad_norm": 5.779355525970459, + "kl": 14.6875, + "learning_rate": 1.4051415867798627e-05, + "loss": 1.4482, + "num_tokens": 27822470.0, + "reward": 1.4140625, + "reward_std": 0.8773753494024277, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.479247085750103, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.434623621404171, + "step": 862, + "token_counts/after_target": 981.75, + "token_counts/after_think": 247.25, + "token_counts/before_target": 2011.5, + "token_counts/before_think": 1319.75 + }, + { + "avg_penalty/after_target": 2.543175846338272, + "avg_penalty/after_think": 2.940591275691986, + "avg_penalty/before_target": 0.4281017929315567, + "avg_penalty/before_think": 0.5922511070966721, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.015625, + "completions/max_length": 668.25, + "completions/max_terminated_length": 649.0, + "completions/mean_length": 298.984375, + "completions/mean_terminated_length": 288.8979187011719, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.4315, + "grad_norm": 8.190215110778809, + "kl": 15.828125, + "learning_rate": 1.4035452963523903e-05, + "loss": 1.6518, + "num_tokens": 27851205.0, + "reward": 1.46875, + "reward_std": 0.8432001024484634, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4682852029800415, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.39295003563165665, + "step": 863, + "token_counts/after_target": 1099.75, + "token_counts/after_think": 117.25, + "token_counts/before_target": 2359.5, + "token_counts/before_think": 1207.25 + }, + { + "avg_penalty/after_target": 2.4607058465480804, + "avg_penalty/after_think": 2.953328400850296, + "avg_penalty/before_target": 0.34808648005127907, + "avg_penalty/before_think": 0.7060301154851913, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 774.75, + "completions/max_terminated_length": 639.75, + "completions/mean_length": 246.828125, + "completions/mean_terminated_length": 234.78229522705078, + "completions/min_length": 48.25, + "completions/min_terminated_length": 48.25, + "epoch": 0.432, + "grad_norm": 10.946245193481445, + "kl": 25.0625, + "learning_rate": 1.4019477766559604e-05, + "loss": 1.8609, + "num_tokens": 27877114.0, + "reward": 1.1953125, + "reward_std": 0.9190218150615692, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 
0.4840351790189743, + "rewards/tag_count_reward/mean": 0.6171875, + "rewards/tag_count_reward/std": 0.44241177290678024, + "step": 864, + "token_counts/after_target": 809.5, + "token_counts/after_think": 95.0, + "token_counts/before_target": 2333.0, + "token_counts/before_think": 711.75 + }, + { + "avg_penalty/after_target": 2.5093097388744354, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4524102061986923, + "avg_penalty/before_think": 0.6001399159431458, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.25, + "completions/max_terminated_length": 541.25, + "completions/mean_length": 231.734375, + "completions/mean_terminated_length": 231.734375, + "completions/min_length": 70.25, + "completions/min_terminated_length": 70.25, + "epoch": 0.4325, + "grad_norm": 5.803097724914551, + "kl": 16.8125, + "learning_rate": 1.4003490325568953e-05, + "loss": 1.4646, + "num_tokens": 27901609.0, + "reward": 1.3984375, + "reward_std": 0.88588547706604, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.47354350984096527, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.4290331229567528, + "step": 865, + "token_counts/after_target": 923.0, + "token_counts/after_think": 59.5, + "token_counts/before_target": 1821.0, + "token_counts/before_think": 904.25 + }, + { + "avg_penalty/after_target": 2.003779023885727, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.2921587750315666, + "avg_penalty/before_think": 0.5384876728057861, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 
572.25, + "completions/max_terminated_length": 482.5, + "completions/mean_length": 200.5625, + "completions/mean_terminated_length": 187.64479446411133, + "completions/min_length": 38.5, + "completions/min_terminated_length": 38.5, + "epoch": 0.433, + "grad_norm": 15.915315628051758, + "kl": 22.71875, + "learning_rate": 1.3987490689252463e-05, + "loss": 1.4148, + "num_tokens": 27926701.0, + "reward": 1.18359375, + "reward_std": 0.9532683342695236, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.48605145514011383, + "rewards/tag_count_reward/mean": 0.60546875, + "rewards/tag_count_reward/std": 0.4769538789987564, + "step": 866, + "token_counts/after_target": 457.0, + "token_counts/after_think": 33.5, + "token_counts/before_target": 1964.0, + "token_counts/before_think": 754.5 + }, + { + "avg_penalty/after_target": 1.7962811589241028, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4606577232480049, + "avg_penalty/before_think": 0.5225026085972786, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 583.0, + "completions/max_terminated_length": 476.75, + "completions/mean_length": 247.265625, + "completions/mean_terminated_length": 235.28334045410156, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.4335, + "grad_norm": 9.73831558227539, + "kl": 25.21875, + "learning_rate": 1.3971478906347806e-05, + "loss": 1.8567, + "num_tokens": 27953006.0, + "reward": 1.2421875, + "reward_std": 0.9313626438379288, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.49500229209661484, + "rewards/tag_count_reward/mean": 0.6484375, + 
"rewards/tag_count_reward/std": 0.4481755420565605, + "step": 867, + "token_counts/after_target": 755.5, + "token_counts/after_think": 29.25, + "token_counts/before_target": 2468.75, + "token_counts/before_think": 702.75 + }, + { + "avg_penalty/after_target": 2.5185903310775757, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.43431153148412704, + "avg_penalty/before_think": 0.6478067338466644, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 709.0, + "completions/max_terminated_length": 588.5, + "completions/mean_length": 270.546875, + "completions/mean_terminated_length": 257.4072952270508, + "completions/min_length": 70.5, + "completions/min_terminated_length": 70.5, + "epoch": 0.434, + "grad_norm": 5.27189826965332, + "kl": 26.125, + "learning_rate": 1.3955455025629652e-05, + "loss": 2.1249, + "num_tokens": 27981217.0, + "reward": 1.24609375, + "reward_std": 0.9474621415138245, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.49776528775691986, + "rewards/tag_count_reward/mean": 0.65234375, + "rewards/tag_count_reward/std": 0.47417640686035156, + "step": 868, + "token_counts/after_target": 1081.0, + "token_counts/after_think": 16.0, + "token_counts/before_target": 2411.5, + "token_counts/before_think": 820.25 + }, + { + "avg_penalty/after_target": 2.789496123790741, + "avg_penalty/after_think": 3.675918757915497, + "avg_penalty/before_target": 0.2942761033773422, + "avg_penalty/before_think": 0.5654862225055695, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 705.25, + 
"completions/max_terminated_length": 566.0, + "completions/mean_length": 233.90625, + "completions/mean_terminated_length": 221.7760467529297, + "completions/min_length": 69.25, + "completions/min_terminated_length": 69.25, + "epoch": 0.4345, + "grad_norm": 5.183332443237305, + "kl": 18.25, + "learning_rate": 1.3939419095909513e-05, + "loss": 1.7378, + "num_tokens": 28007147.0, + "reward": 1.5234375, + "reward_std": 0.8545179814100266, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.4205036014318466, + "step": 869, + "token_counts/after_target": 702.0, + "token_counts/after_think": 56.25, + "token_counts/before_target": 2199.5, + "token_counts/before_think": 784.75 + }, + { + "avg_penalty/after_target": 2.321689009666443, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.344707690179348, + "avg_penalty/before_think": 0.40276525169610977, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 241.96875, + "completions/mean_terminated_length": 241.96875, + "completions/min_length": 70.5, + "completions/min_terminated_length": 70.5, + "epoch": 0.435, + "grad_norm": 2.5815846920013428, + "kl": 22.125, + "learning_rate": 1.3923371166035615e-05, + "loss": 1.7942, + "num_tokens": 28034889.0, + "reward": 1.390625, + "reward_std": 0.914295494556427, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.46566852182149887, + "rewards/tag_count_reward/mean": 0.703125, + "rewards/tag_count_reward/std": 0.4513040855526924, + "step": 
870, + "token_counts/after_target": 726.75, + "token_counts/after_think": 4.0, + "token_counts/before_target": 2349.0, + "token_counts/before_think": 791.75 + }, + { + "avg_penalty/after_target": 2.7734594345092773, + "avg_penalty/after_think": 2.848968744277954, + "avg_penalty/before_target": 0.42393046990036964, + "avg_penalty/before_think": 0.6568097323179245, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 684.5, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 232.21875, + "completions/mean_terminated_length": 220.32083892822266, + "completions/min_length": 45.25, + "completions/min_terminated_length": 45.25, + "epoch": 0.4355, + "grad_norm": 6.861468315124512, + "kl": 17.5, + "learning_rate": 1.3907311284892737e-05, + "loss": 1.6595, + "num_tokens": 28060023.0, + "reward": 1.40625, + "reward_std": 0.8837426900863647, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4704566150903702, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.4217309206724167, + "step": 871, + "token_counts/after_target": 869.0, + "token_counts/after_think": 39.75, + "token_counts/before_target": 2064.75, + "token_counts/before_think": 742.0 + }, + { + "avg_penalty/after_target": 2.6726302206516266, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.47720039263367653, + "avg_penalty/before_think": 0.6200348660349846, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 610.25, + "completions/max_terminated_length": 610.25, + "completions/mean_length": 217.375, + 
"completions/mean_terminated_length": 217.375, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.436, + "grad_norm": 11.785319328308105, + "kl": 14.921875, + "learning_rate": 1.3891239501402063e-05, + "loss": 1.6557, + "num_tokens": 28083503.0, + "reward": 1.51953125, + "reward_std": 0.7848801463842392, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.39964763820171356, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.38897372782230377, + "step": 872, + "token_counts/after_target": 876.25, + "token_counts/after_think": 57.0, + "token_counts/before_target": 1810.0, + "token_counts/before_think": 734.75 + }, + { + "avg_penalty/after_target": 3.0391354858875275, + "avg_penalty/after_think": 3.7722517251968384, + "avg_penalty/before_target": 0.21402283012866974, + "avg_penalty/before_think": 0.5364055186510086, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 642.5, + "completions/max_terminated_length": 546.5, + "completions/mean_length": 186.65625, + "completions/mean_terminated_length": 173.83854293823242, + "completions/min_length": 69.25, + "completions/min_terminated_length": 69.25, + "epoch": 0.4365, + "grad_norm": 6.96510648727417, + "kl": 15.328125, + "learning_rate": 1.3875155864521031e-05, + "loss": 1.5251, + "num_tokens": 28107913.0, + "reward": 1.58984375, + "reward_std": 0.7583417147397995, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.41503459960222244, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.3308092951774597, + "step": 873, + "token_counts/after_target": 
512.25, + "token_counts/after_think": 29.75, + "token_counts/before_target": 1699.25, + "token_counts/before_think": 745.25 + }, + { + "avg_penalty/after_target": 2.339417517185211, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3947811983525753, + "avg_penalty/before_think": 0.6600294560194016, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 218.015625, + "completions/mean_terminated_length": 218.015625, + "completions/min_length": 70.75, + "completions/min_terminated_length": 70.75, + "epoch": 0.437, + "grad_norm": 6.076087951660156, + "kl": 8.6923828125, + "learning_rate": 1.3859060423243187e-05, + "loss": 1.0438, + "num_tokens": 28131482.0, + "reward": 1.69921875, + "reward_std": 0.5370319783687592, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.2759781554341316, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.26638131588697433, + "step": 874, + "token_counts/after_target": 823.75, + "token_counts/after_think": 41.25, + "token_counts/before_target": 1559.25, + "token_counts/before_think": 1064.0 + }, + { + "avg_penalty/after_target": 2.9287087321281433, + "avg_penalty/after_think": 2.5204412937164307, + "avg_penalty/before_target": 0.2763471454381943, + "avg_penalty/before_think": 0.5100396797060966, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 655.0, + "completions/max_terminated_length": 655.0, + "completions/mean_length": 204.65625, + "completions/mean_terminated_length": 204.65625, + 
"completions/min_length": 48.5, + "completions/min_terminated_length": 48.5, + "epoch": 0.4375, + "grad_norm": 2.3974812030792236, + "kl": 12.72265625, + "learning_rate": 1.3842953226598036e-05, + "loss": 1.0851, + "num_tokens": 28154148.0, + "reward": 1.5703125, + "reward_std": 0.6556922644376755, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.35169370472431183, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3154975697398186, + "step": 875, + "token_counts/after_target": 264.75, + "token_counts/after_think": 68.75, + "token_counts/before_target": 1919.25, + "token_counts/before_think": 1021.75 + }, + { + "avg_penalty/after_target": 2.1216216683387756, + "avg_penalty/after_think": 2.70710152387619, + "avg_penalty/before_target": 0.379309568554163, + "avg_penalty/before_think": 0.4914890453219414, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.75, + "completions/max_terminated_length": 531.75, + "completions/mean_length": 172.6875, + "completions/mean_terminated_length": 172.6875, + "completions/min_length": 43.25, + "completions/min_terminated_length": 43.25, + "epoch": 0.438, + "grad_norm": 2.8078975677490234, + "kl": 18.109375, + "learning_rate": 1.3826834323650899e-05, + "loss": 1.4805, + "num_tokens": 28177856.0, + "reward": 1.4765625, + "reward_std": 0.8691465854644775, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.43008168041706085, + "step": 876, + "token_counts/after_target": 508.0, + "token_counts/after_think": 27.75, + 
"token_counts/before_target": 1564.25, + "token_counts/before_think": 663.0 + }, + { + "avg_penalty/after_target": 3.2612134218215942, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3216549940407276, + "avg_penalty/before_think": 0.4353679344058037, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.75, + "completions/max_terminated_length": 407.75, + "completions/mean_length": 154.546875, + "completions/mean_terminated_length": 154.546875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.4385, + "grad_norm": 3.1507484912872314, + "kl": 16.3125, + "learning_rate": 1.3810703763502744e-05, + "loss": 1.4376, + "num_tokens": 28197603.0, + "reward": 1.546875, + "reward_std": 0.8443699777126312, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.43303824216127396, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.4164556637406349, + "step": 877, + "token_counts/after_target": 434.5, + "token_counts/after_think": 13.0, + "token_counts/before_target": 1557.5, + "token_counts/before_think": 467.75 + }, + { + "avg_penalty/after_target": 2.0266568660736084, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4060707539319992, + "avg_penalty/before_think": 0.4447743855416775, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.25, + "completions/max_terminated_length": 660.25, + "completions/mean_length": 219.671875, + "completions/mean_terminated_length": 219.671875, + "completions/min_length": 51.5, + "completions/min_terminated_length": 
51.5, + "epoch": 0.439, + "grad_norm": 6.822876453399658, + "kl": 15.8984375, + "learning_rate": 1.3794561595290053e-05, + "loss": 1.2251, + "num_tokens": 28220686.0, + "reward": 1.45703125, + "reward_std": 0.8634690493345261, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4682852029800415, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.4114953577518463, + "step": 878, + "token_counts/after_target": 657.5, + "token_counts/after_think": 32.0, + "token_counts/before_target": 1678.5, + "token_counts/before_think": 1146.75 + }, + { + "avg_penalty/after_target": 2.645691156387329, + "avg_penalty/after_think": 1.8111258149147034, + "avg_penalty/before_target": 0.24251514673233032, + "avg_penalty/before_think": 0.41542959213256836, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 174.53125, + "completions/mean_terminated_length": 174.53125, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.4395, + "grad_norm": 6.879358291625977, + "kl": 15.3125, + "learning_rate": 1.3778407868184674e-05, + "loss": 1.0903, + "num_tokens": 28243248.0, + "reward": 1.49609375, + "reward_std": 0.7890335470438004, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4163651168346405, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.3822452798485756, + "step": 879, + "token_counts/after_target": 306.0, + "token_counts/after_think": 34.75, + "token_counts/before_target": 1697.5, + "token_counts/before_think": 754.25 + }, + { + 
"avg_penalty/after_target": 2.5586535334587097, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.2834148593246937, + "avg_penalty/before_think": 0.48718955367803574, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.25, + "completions/max_terminated_length": 551.25, + "completions/mean_length": 181.859375, + "completions/mean_terminated_length": 181.859375, + "completions/min_length": 33.75, + "completions/min_terminated_length": 33.75, + "epoch": 0.44, + "grad_norm": 8.78133773803711, + "kl": 10.66015625, + "learning_rate": 1.3762242631393656e-05, + "loss": 0.6536, + "num_tokens": 28265415.0, + "reward": 1.55078125, + "reward_std": 0.7985986173152924, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.41110680997371674, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.3940122053027153, + "step": 880, + "token_counts/after_target": 316.25, + "token_counts/after_think": 25.0, + "token_counts/before_target": 1939.25, + "token_counts/before_think": 629.25 + }, + { + "avg_penalty/after_target": 2.7476646900177, + "avg_penalty/after_think": 3.977950930595398, + "avg_penalty/before_target": 0.3906528651714325, + "avg_penalty/before_think": 0.5444916039705276, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.25, + "completions/max_terminated_length": 570.25, + "completions/mean_length": 158.15625, + "completions/mean_terminated_length": 158.15625, + "completions/min_length": 36.25, + "completions/min_terminated_length": 36.25, + "epoch": 0.4405, + "grad_norm": 6.123039245605469, + 
"kl": 12.8359375, + "learning_rate": 1.3746065934159123e-05, + "loss": 1.3224, + "num_tokens": 28284833.0, + "reward": 1.55078125, + "reward_std": 0.781711533665657, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4308478757739067, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.3829060047864914, + "step": 881, + "token_counts/after_target": 465.25, + "token_counts/after_think": 109.5, + "token_counts/before_target": 1342.25, + "token_counts/before_think": 613.5 + }, + { + "avg_penalty/after_target": 2.711698293685913, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3293752446770668, + "avg_penalty/before_think": 0.4845479428768158, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.75, + "completions/max_terminated_length": 463.75, + "completions/mean_length": 170.546875, + "completions/mean_terminated_length": 170.546875, + "completions/min_length": 50.25, + "completions/min_terminated_length": 50.25, + "epoch": 0.441, + "grad_norm": 4.668527603149414, + "kl": 11.7109375, + "learning_rate": 1.3729877825758091e-05, + "loss": 1.1412, + "num_tokens": 28305556.0, + "reward": 1.60546875, + "reward_std": 0.7609438002109528, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.39476002007722855, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.3729308694601059, + "step": 882, + "token_counts/after_target": 392.0, + "token_counts/after_think": 24.25, + "token_counts/before_target": 1629.25, + "token_counts/before_think": 683.25 + }, + { + "avg_penalty/after_target": 3.4474933743476868, + "avg_penalty/after_think": 4.0, + 
"avg_penalty/before_target": 0.3718871399760246, + "avg_penalty/before_think": 0.5085670948028564, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.75, + "completions/max_terminated_length": 421.75, + "completions/mean_length": 130.453125, + "completions/mean_terminated_length": 130.453125, + "completions/min_length": 40.75, + "completions/min_terminated_length": 40.75, + "epoch": 0.4415, + "grad_norm": 8.86090087890625, + "kl": 14.6953125, + "learning_rate": 1.371367835550235e-05, + "loss": 1.5605, + "num_tokens": 28325537.0, + "reward": 1.59375, + "reward_std": 0.7693893015384674, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.3846946656703949, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.3846946656703949, + "step": 883, + "token_counts/after_target": 413.75, + "token_counts/after_think": 50.25, + "token_counts/before_target": 1175.5, + "token_counts/before_think": 447.75 + }, + { + "avg_penalty/after_target": 2.4823061525821686, + "avg_penalty/after_think": 2.891494929790497, + "avg_penalty/before_target": 0.38196447491645813, + "avg_penalty/before_think": 0.5871096327900887, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 781.75, + "completions/max_terminated_length": 660.75, + "completions/mean_length": 189.859375, + "completions/mean_terminated_length": 176.2437515258789, + "completions/min_length": 47.75, + "completions/min_terminated_length": 47.75, + "epoch": 0.442, + "grad_norm": 4.693525314331055, + "kl": 18.484375, + "learning_rate": 1.3697467572738294e-05, + "loss": 1.6015, 
+ "num_tokens": 28347544.0, + "reward": 1.5, + "reward_std": 0.8818264603614807, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.44091323018074036, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.44091323018074036, + "step": 884, + "token_counts/after_target": 692.5, + "token_counts/after_think": 126.0, + "token_counts/before_target": 1607.0, + "token_counts/before_think": 612.25 + }, + { + "avg_penalty/after_target": 2.7327885031700134, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3455358035862446, + "avg_penalty/before_think": 0.7125380784273148, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.25, + "completions/max_terminated_length": 537.25, + "completions/mean_length": 204.4375, + "completions/mean_terminated_length": 204.4375, + "completions/min_length": 34.5, + "completions/min_terminated_length": 34.5, + "epoch": 0.4425, + "grad_norm": 5.912520885467529, + "kl": 6.865234375, + "learning_rate": 1.3681245526846782e-05, + "loss": 0.793, + "num_tokens": 28369828.0, + "reward": 1.6328125, + "reward_std": 0.7627186477184296, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.39123913645744324, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.38139040768146515, + "step": 885, + "token_counts/after_target": 617.25, + "token_counts/after_think": 113.25, + "token_counts/before_target": 1504.25, + "token_counts/before_think": 1036.25 + }, + { + "avg_penalty/after_target": 2.435311406850815, + "avg_penalty/after_think": 3.9880074858665466, + "avg_penalty/before_target": 0.35255926102399826, + "avg_penalty/before_think": 
0.6587579250335693, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.75, + "completions/max_terminated_length": 582.75, + "completions/mean_length": 227.96875, + "completions/mean_terminated_length": 227.96875, + "completions/min_length": 59.75, + "completions/min_terminated_length": 59.75, + "epoch": 0.443, + "grad_norm": 4.827857971191406, + "kl": 17.2578125, + "learning_rate": 1.3665012267242974e-05, + "loss": 1.4923, + "num_tokens": 28395314.0, + "reward": 1.609375, + "reward_std": 0.7770551890134811, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.38153913617134094, + "step": 886, + "token_counts/after_target": 602.75, + "token_counts/after_think": 130.75, + "token_counts/before_target": 2036.5, + "token_counts/before_think": 877.5 + }, + { + "avg_penalty/after_target": 2.952914237976074, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.316231869161129, + "avg_penalty/before_think": 0.503338947892189, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.5, + "completions/max_terminated_length": 575.5, + "completions/mean_length": 189.125, + "completions/mean_terminated_length": 189.125, + "completions/min_length": 36.5, + "completions/min_terminated_length": 36.5, + "epoch": 0.4435, + "grad_norm": 7.610786437988281, + "kl": 17.703125, + "learning_rate": 1.3648767843376196e-05, + "loss": 1.2688, + "num_tokens": 28416570.0, + "reward": 1.4140625, + "reward_std": 0.9086247682571411, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4634971097111702, + "rewards/tag_count_reward/mean": 0.7109375, + "rewards/tag_count_reward/std": 0.4508437439799309, + "step": 887, + "token_counts/after_target": 564.0, + "token_counts/after_think": 35.5, + "token_counts/before_target": 1723.5, + "token_counts/before_think": 703.0 + }, + { + "avg_penalty/after_target": 2.4381262063980103, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.40056654065847397, + "avg_penalty/before_think": 0.5427215471863747, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 215.125, + "completions/mean_terminated_length": 215.125, + "completions/min_length": 26.25, + "completions/min_terminated_length": 26.25, + "epoch": 0.444, + "grad_norm": 3.2299575805664062, + "kl": 11.9375, + "learning_rate": 1.3632512304729786e-05, + "loss": 0.9914, + "num_tokens": 28440082.0, + "reward": 1.546875, + "reward_std": 0.8212533295154572, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4383598491549492, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.3978056088089943, + "step": 888, + "token_counts/after_target": 673.0, + "token_counts/after_think": 80.25, + "token_counts/before_target": 1487.0, + "token_counts/before_think": 1201.75 + }, + { + "avg_penalty/after_target": 2.7613448202610016, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.40628447011113167, + "avg_penalty/before_think": 0.5096017122268677, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 
0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.5, + "completions/max_terminated_length": 539.5, + "completions/mean_length": 213.703125, + "completions/mean_terminated_length": 213.703125, + "completions/min_length": 48.75, + "completions/min_terminated_length": 48.75, + "epoch": 0.4445, + "grad_norm": 4.678145408630371, + "kl": 17.75390625, + "learning_rate": 1.3616245700820922e-05, + "loss": 1.6804, + "num_tokens": 28462383.0, + "reward": 1.63671875, + "reward_std": 0.6299134492874146, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.3432852029800415, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.3054095432162285, + "step": 889, + "token_counts/after_target": 939.5, + "token_counts/after_think": 65.0, + "token_counts/before_target": 1742.75, + "token_counts/before_think": 672.0 + }, + { + "avg_penalty/after_target": 2.204716444015503, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.4060225114226341, + "avg_penalty/before_think": 0.539106473326683, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 237.078125, + "completions/mean_terminated_length": 237.078125, + "completions/min_length": 54.25, + "completions/min_terminated_length": 54.25, + "epoch": 0.445, + "grad_norm": 6.254549980163574, + "kl": 20.515625, + "learning_rate": 1.3599968081200515e-05, + "loss": 1.6086, + "num_tokens": 28486612.0, + "reward": 1.53515625, + "reward_std": 0.8060239851474762, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + 
"rewards/format_reward/std": 0.44938503205776215, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.38119929283857346, + "step": 890, + "token_counts/after_target": 837.25, + "token_counts/after_think": 22.75, + "token_counts/before_target": 1915.5, + "token_counts/before_think": 1017.75 + }, + { + "avg_penalty/after_target": 2.356177031993866, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3658176064491272, + "avg_penalty/before_think": 0.44459206610918045, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.5, + "completions/max_terminated_length": 481.5, + "completions/mean_length": 199.390625, + "completions/mean_terminated_length": 199.390625, + "completions/min_length": 54.75, + "completions/min_terminated_length": 54.75, + "epoch": 0.4455, + "grad_norm": 3.8672425746917725, + "kl": 12.109375, + "learning_rate": 1.3583679495453e-05, + "loss": 1.0551, + "num_tokens": 28507501.0, + "reward": 1.58984375, + "reward_std": 0.7529961466789246, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4255262687802315, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.3467573747038841, + "step": 891, + "token_counts/after_target": 442.5, + "token_counts/after_think": 115.25, + "token_counts/before_target": 1635.0, + "token_counts/before_think": 997.5 + }, + { + "avg_penalty/after_target": 2.862457036972046, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3884703740477562, + "avg_penalty/before_think": 0.504605233669281, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, 
+ "completions/max_length": 649.75, + "completions/max_terminated_length": 649.75, + "completions/mean_length": 265.953125, + "completions/mean_terminated_length": 265.953125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.446, + "grad_norm": 3.4909942150115967, + "kl": 19.25, + "learning_rate": 1.3567379993196252e-05, + "loss": 1.6356, + "num_tokens": 28533482.0, + "reward": 1.5859375, + "reward_std": 0.7112511992454529, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4106728211045265, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.33289606869220734, + "step": 892, + "token_counts/after_target": 784.75, + "token_counts/after_think": 98.5, + "token_counts/before_target": 2186.5, + "token_counts/before_think": 1185.5 + }, + { + "avg_penalty/after_target": 2.568478047847748, + "avg_penalty/after_think": 3.956317365169525, + "avg_penalty/before_target": 0.3068803511559963, + "avg_penalty/before_think": 0.5270242094993591, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.75, + "completions/max_terminated_length": 561.75, + "completions/mean_length": 258.203125, + "completions/mean_terminated_length": 258.203125, + "completions/min_length": 58.5, + "completions/min_terminated_length": 58.5, + "epoch": 0.4465, + "grad_norm": 6.5558180809021, + "kl": 20.390625, + "learning_rate": 1.3551069624081372e-05, + "loss": 1.5943, + "num_tokens": 28558791.0, + "reward": 1.453125, + "reward_std": 0.8397673070430756, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4713720977306366, + "rewards/tag_count_reward/mean": 0.765625, + 
"rewards/tag_count_reward/std": 0.4017672911286354, + "step": 893, + "token_counts/after_target": 695.25, + "token_counts/after_think": 128.25, + "token_counts/before_target": 2049.75, + "token_counts/before_think": 1258.0 + }, + { + "avg_penalty/after_target": 2.7599453032016754, + "avg_penalty/after_think": 3.8918033242225647, + "avg_penalty/before_target": 0.2955208234488964, + "avg_penalty/before_think": 0.43982334434986115, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.5, + "completions/max_terminated_length": 554.5, + "completions/mean_length": 275.171875, + "completions/mean_terminated_length": 275.171875, + "completions/min_length": 65.75, + "completions/min_terminated_length": 65.75, + "epoch": 0.447, + "grad_norm": 3.1700427532196045, + "kl": 13.5859375, + "learning_rate": 1.3534748437792573e-05, + "loss": 1.1071, + "num_tokens": 28586738.0, + "reward": 1.61328125, + "reward_std": 0.7221298813819885, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4000816270709038, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.34331195056438446, + "step": 894, + "token_counts/after_target": 456.5, + "token_counts/after_think": 102.25, + "token_counts/before_target": 2005.0, + "token_counts/before_think": 1839.0 + }, + { + "avg_penalty/after_target": 2.277685761451721, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.45969048887491226, + "avg_penalty/before_think": 0.5404075309634209, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 663.0, + 
"completions/max_terminated_length": 571.0, + "completions/mean_length": 255.84375, + "completions/mean_terminated_length": 242.95312881469727, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.4475, + "grad_norm": 6.853213787078857, + "kl": 17.4921875, + "learning_rate": 1.3518416484047018e-05, + "loss": 1.6862, + "num_tokens": 28615080.0, + "reward": 1.64453125, + "reward_std": 0.6561687886714935, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.3890564441680908, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.27929775416851044, + "step": 895, + "token_counts/after_target": 891.25, + "token_counts/after_think": 37.0, + "token_counts/before_target": 1930.75, + "token_counts/before_think": 1234.5 + }, + { + "avg_penalty/after_target": 2.203952819108963, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.42867370694875717, + "avg_penalty/before_think": 0.4638734869658947, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 291.25, + "completions/mean_terminated_length": 291.25, + "completions/min_length": 45.75, + "completions/min_terminated_length": 45.75, + "epoch": 0.448, + "grad_norm": 5.796187400817871, + "kl": 25.125, + "learning_rate": 1.3502073812594677e-05, + "loss": 1.9197, + "num_tokens": 28645240.0, + "reward": 1.26171875, + "reward_std": 0.9442601799964905, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.5030868947505951, + "rewards/tag_count_reward/mean": 0.65234375, + "rewards/tag_count_reward/std": 
0.470088429749012, + "step": 896, + "token_counts/after_target": 1036.0, + "token_counts/after_think": 114.25, + "token_counts/before_target": 2692.5, + "token_counts/before_think": 817.25 + }, + { + "avg_penalty/after_target": 2.578750729560852, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4673622250556946, + "avg_penalty/before_think": 0.4438391253352165, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 766.75, + "completions/max_terminated_length": 617.5, + "completions/mean_length": 261.1875, + "completions/mean_terminated_length": 248.89479446411133, + "completions/min_length": 54.75, + "completions/min_terminated_length": 54.75, + "epoch": 0.4485, + "grad_norm": 10.59843635559082, + "kl": 17.671875, + "learning_rate": 1.3485720473218153e-05, + "loss": 1.7723, + "num_tokens": 28670548.0, + "reward": 1.4140625, + "reward_std": 0.8517706990242004, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.48935678601264954, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.3946138843894005, + "step": 897, + "token_counts/after_target": 955.5, + "token_counts/after_think": 100.75, + "token_counts/before_target": 1711.5, + "token_counts/before_think": 1411.25 + }, + { + "avg_penalty/after_target": 2.812227487564087, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3770855702459812, + "avg_penalty/before_think": 0.5888780616223812, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 655.0, + "completions/max_terminated_length": 655.0, + "completions/mean_length": 
339.453125, + "completions/mean_terminated_length": 339.453125, + "completions/min_length": 83.75, + "completions/min_terminated_length": 83.75, + "epoch": 0.449, + "grad_norm": 12.8622407913208, + "kl": 11.9140625, + "learning_rate": 1.3469356515732559e-05, + "loss": 1.4586, + "num_tokens": 28702449.0, + "reward": 1.46484375, + "reward_std": 0.708775982260704, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.42430340498685837, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.3381448835134506, + "step": 898, + "token_counts/after_target": 1163.0, + "token_counts/after_think": 228.5, + "token_counts/before_target": 1876.75, + "token_counts/before_think": 2163.0 + }, + { + "avg_penalty/after_target": 1.8588674664497375, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.564252994954586, + "avg_penalty/before_think": 0.5951411798596382, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 869.25, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 320.53125, + "completions/mean_terminated_length": 299.21875762939453, + "completions/min_length": 54.75, + "completions/min_terminated_length": 54.75, + "epoch": 0.4495, + "grad_norm": 8.90394401550293, + "kl": 17.484375, + "learning_rate": 1.3452981989985347e-05, + "loss": 1.7414, + "num_tokens": 28735699.0, + "reward": 1.484375, + "reward_std": 0.9478929936885834, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.1905868947505951, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4697679653763771, + "rewards/tag_count_reward/mean": 0.703125, + "rewards/tag_count_reward/std": 0.42083193361759186, + "step": 899, + 
"token_counts/after_target": 1309.0, + "token_counts/after_think": 133.5, + "token_counts/before_target": 2296.5, + "token_counts/before_think": 1389.5 + }, + { + "avg_penalty/after_target": 2.094171702861786, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3475494831800461, + "avg_penalty/before_think": 0.4768485873937607, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.25, + "completions/max_terminated_length": 518.25, + "completions/mean_length": 212.484375, + "completions/mean_terminated_length": 212.484375, + "completions/min_length": 40.5, + "completions/min_terminated_length": 40.5, + "epoch": 0.45, + "grad_norm": 5.070446968078613, + "kl": 13.734375, + "learning_rate": 1.3436596945856164e-05, + "loss": 1.3351, + "num_tokens": 28760098.0, + "reward": 1.53515625, + "reward_std": 0.8773428350687027, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44938503205776215, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.41879353672266006, + "step": 900, + "token_counts/after_target": 476.0, + "token_counts/after_think": 95.5, + "token_counts/before_target": 2084.0, + "token_counts/before_think": 744.25 + }, + { + "avg_penalty/after_target": 2.415354073047638, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4458158388733864, + "avg_penalty/before_think": 0.496553435921669, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 635.0, + "completions/max_terminated_length": 506.25, + "completions/mean_length": 223.71875, + 
"completions/mean_terminated_length": 210.5093765258789, + "completions/min_length": 61.75, + "completions/min_terminated_length": 61.75, + "epoch": 0.4505, + "grad_norm": 7.925318717956543, + "kl": 13.484375, + "learning_rate": 1.342020143325669e-05, + "loss": 1.4123, + "num_tokens": 28785360.0, + "reward": 1.515625, + "reward_std": 0.7905955910682678, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.45283494144678116, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.3657883480191231, + "step": 901, + "token_counts/after_target": 663.25, + "token_counts/after_think": 60.75, + "token_counts/before_target": 1846.0, + "token_counts/before_think": 1009.5 + }, + { + "avg_penalty/after_target": 1.7915496528148651, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4309007339179516, + "avg_penalty/before_think": 0.5028593316674232, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 617.25, + "completions/max_terminated_length": 617.25, + "completions/mean_length": 275.046875, + "completions/mean_terminated_length": 275.046875, + "completions/min_length": 51.5, + "completions/min_terminated_length": 51.5, + "epoch": 0.451, + "grad_norm": 2.466311454772949, + "kl": 18.84375, + "learning_rate": 1.3403795502130503e-05, + "loss": 1.6071, + "num_tokens": 28814435.0, + "reward": 1.25390625, + "reward_std": 0.852387547492981, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.4819520115852356, + "rewards/tag_count_reward/mean": 0.70703125, + "rewards/tag_count_reward/std": 0.4198809117078781, + "step": 902, + "token_counts/after_target": 774.75, + 
"token_counts/after_think": 150.75, + "token_counts/before_target": 2191.0, + "token_counts/before_think": 1284.25 + }, + { + "avg_penalty/after_target": 2.4875659346580505, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.33325550705194473, + "avg_penalty/before_think": 0.4930916950106621, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.25, + "completions/max_terminated_length": 550.25, + "completions/mean_length": 202.453125, + "completions/mean_terminated_length": 202.453125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.4515, + "grad_norm": 5.61767053604126, + "kl": 9.265625, + "learning_rate": 1.3387379202452917e-05, + "loss": 0.9818, + "num_tokens": 28837632.0, + "reward": 1.62109375, + "reward_std": 0.7291287779808044, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4101393073797226, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.343844935297966, + "step": 903, + "token_counts/after_target": 513.25, + "token_counts/after_think": 77.5, + "token_counts/before_target": 1355.25, + "token_counts/before_think": 1293.25 + }, + { + "avg_penalty/after_target": 2.5530099868774414, + "avg_penalty/after_think": 1.754914402961731, + "avg_penalty/before_target": 0.3343914598226547, + "avg_penalty/before_think": 0.43127841502428055, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.75, + "completions/max_terminated_length": 427.75, + "completions/mean_length": 187.25, + "completions/mean_terminated_length": 187.25, + 
"completions/min_length": 37.5, + "completions/min_terminated_length": 37.5, + "epoch": 0.452, + "grad_norm": 6.837617874145508, + "kl": 13.7421875, + "learning_rate": 1.3370952584230823e-05, + "loss": 1.0075, + "num_tokens": 28859792.0, + "reward": 1.41015625, + "reward_std": 0.8714681267738342, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.4757782220840454, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.41714590787887573, + "step": 904, + "token_counts/after_target": 414.25, + "token_counts/after_think": 32.0, + "token_counts/before_target": 1602.25, + "token_counts/before_think": 947.5 + }, + { + "avg_penalty/after_target": 3.2683281302452087, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4285987466573715, + "avg_penalty/before_think": 0.4437098763883114, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 679.25, + "completions/max_terminated_length": 577.25, + "completions/mean_length": 279.3125, + "completions/mean_terminated_length": 267.01354598999023, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.4525, + "grad_norm": 6.276355743408203, + "kl": 18.296875, + "learning_rate": 1.3354515697502552e-05, + "loss": 1.4914, + "num_tokens": 28885844.0, + "reward": 1.41015625, + "reward_std": 0.8037070035934448, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.47663040459156036, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.3597647063434124, + "step": 905, + "token_counts/after_target": 768.5, + "token_counts/after_think": 70.75, + 
"token_counts/before_target": 2035.5, + "token_counts/before_think": 1594.25 + }, + { + "avg_penalty/after_target": 2.5077857673168182, + "avg_penalty/after_think": 3.469623565673828, + "avg_penalty/before_target": 0.42159757018089294, + "avg_penalty/before_think": 0.6204300597310066, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.5, + "completions/max_terminated_length": 560.5, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.453, + "grad_norm": 9.612444877624512, + "kl": 19.0625, + "learning_rate": 1.333806859233771e-05, + "loss": 1.3952, + "num_tokens": 28911980.0, + "reward": 1.4296875, + "reward_std": 0.8678483068943024, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4534844756126404, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.425079844892025, + "step": 906, + "token_counts/after_target": 801.5, + "token_counts/after_think": 147.0, + "token_counts/before_target": 2017.5, + "token_counts/before_think": 1372.0 + }, + { + "avg_penalty/after_target": 2.6878820657730103, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4476444348692894, + "avg_penalty/before_think": 0.4515388011932373, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 762.0, + "completions/max_terminated_length": 598.75, + "completions/mean_length": 239.96875, + "completions/mean_terminated_length": 226.38125228881836, + "completions/min_length": 61.0, + 
"completions/min_terminated_length": 61.0, + "epoch": 0.4535, + "grad_norm": 11.722307205200195, + "kl": 26.09375, + "learning_rate": 1.3321611318837033e-05, + "loss": 2.0876, + "num_tokens": 28936186.0, + "reward": 1.328125, + "reward_std": 0.9470726251602173, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.48558124154806137, + "rewards/tag_count_reward/mean": 0.671875, + "rewards/tag_count_reward/std": 0.4501432478427887, + "step": 907, + "token_counts/after_target": 880.25, + "token_counts/after_think": 22.25, + "token_counts/before_target": 1984.25, + "token_counts/before_think": 952.75 + }, + { + "avg_penalty/after_target": 2.171574056148529, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.41801539063453674, + "avg_penalty/before_think": 0.6948781460523605, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 644.25, + "completions/max_terminated_length": 595.25, + "completions/mean_length": 218.9375, + "completions/mean_terminated_length": 206.4958381652832, + "completions/min_length": 40.25, + "completions/min_terminated_length": 40.25, + "epoch": 0.454, + "grad_norm": 10.094655990600586, + "kl": 19.671875, + "learning_rate": 1.3305143927132232e-05, + "loss": 1.5425, + "num_tokens": 28965222.0, + "reward": 1.37890625, + "reward_std": 0.8840279132127762, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.48025963455438614, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.4265768229961395, + "step": 908, + "token_counts/after_target": 760.0, + "token_counts/after_think": 40.25, + "token_counts/before_target": 1771.0, + 
"token_counts/before_think": 931.75 + }, + { + "avg_penalty/after_target": 2.5403505861759186, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3519874997437, + "avg_penalty/before_think": 0.4908682182431221, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.5, + "completions/max_terminated_length": 638.5, + "completions/mean_length": 246.109375, + "completions/mean_terminated_length": 246.109375, + "completions/min_length": 69.75, + "completions/min_terminated_length": 69.75, + "epoch": 0.4545, + "grad_norm": 3.4112625122070312, + "kl": 17.828125, + "learning_rate": 1.3288666467385834e-05, + "loss": 1.6041, + "num_tokens": 28989581.0, + "reward": 1.578125, + "reward_std": 0.824585348367691, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.42516325414180756, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.4019997715950012, + "step": 909, + "token_counts/after_target": 698.0, + "token_counts/after_think": 170.5, + "token_counts/before_target": 1851.5, + "token_counts/before_think": 1217.75 + }, + { + "avg_penalty/after_target": 1.8010999262332916, + "avg_penalty/after_think": 3.8977638483047485, + "avg_penalty/before_target": 0.4172169715166092, + "avg_penalty/before_think": 0.5305547192692757, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 557.75, + "completions/max_terminated_length": 416.75, + "completions/mean_length": 183.203125, + "completions/mean_terminated_length": 169.91146087646484, + "completions/min_length": 39.75, + "completions/min_terminated_length": 39.75, + 
"epoch": 0.455, + "grad_norm": 4.632638931274414, + "kl": 11.2578125, + "learning_rate": 1.327217898979104e-05, + "loss": 1.1377, + "num_tokens": 29017946.0, + "reward": 1.69140625, + "reward_std": 0.6994566768407822, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38336414843797684, + "rewards/tag_count_reward/mean": 0.86328125, + "rewards/tag_count_reward/std": 0.3281131684780121, + "step": 910, + "token_counts/after_target": 358.75, + "token_counts/after_think": 79.75, + "token_counts/before_target": 1552.75, + "token_counts/before_think": 940.0 + }, + { + "avg_penalty/after_target": 2.6120620369911194, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.33851105719804764, + "avg_penalty/before_think": 0.4918861985206604, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.5, + "completions/max_terminated_length": 569.5, + "completions/mean_length": 176.234375, + "completions/mean_terminated_length": 176.234375, + "completions/min_length": 52.25, + "completions/min_terminated_length": 52.25, + "epoch": 0.4555, + "grad_norm": 3.4896321296691895, + "kl": 16.875, + "learning_rate": 1.3255681544571568e-05, + "loss": 1.6434, + "num_tokens": 29041177.0, + "reward": 1.703125, + "reward_std": 0.6666163057088852, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3758598491549492, + "rewards/tag_count_reward/mean": 0.875, + "rewards/tag_count_reward/std": 0.29997093975543976, + "step": 911, + "token_counts/after_target": 449.75, + "token_counts/after_think": 56.0, + "token_counts/before_target": 1491.0, + "token_counts/before_think": 823.0 + }, + { + "avg_penalty/after_target": 
2.6044375002384186, + "avg_penalty/after_think": 3.853838086128235, + "avg_penalty/before_target": 0.2881874032318592, + "avg_penalty/before_think": 0.3976106308400631, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.25, + "completions/max_terminated_length": 521.25, + "completions/mean_length": 211.375, + "completions/mean_terminated_length": 211.375, + "completions/min_length": 57.75, + "completions/min_terminated_length": 57.75, + "epoch": 0.456, + "grad_norm": 6.827002048492432, + "kl": 17.01171875, + "learning_rate": 1.3239174181981496e-05, + "loss": 1.2517, + "num_tokens": 29063233.0, + "reward": 1.46875, + "reward_std": 0.7832833528518677, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4284028485417366, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.3689381182193756, + "step": 912, + "token_counts/after_target": 474.5, + "token_counts/after_think": 57.75, + "token_counts/before_target": 1891.5, + "token_counts/before_think": 958.25 + }, + { + "avg_penalty/after_target": 2.5433565378189087, + "avg_penalty/after_think": 2.7408271431922913, + "avg_penalty/before_target": 0.41775912046432495, + "avg_penalty/before_think": 0.4668225944042206, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.25, + "completions/max_terminated_length": 629.25, + "completions/mean_length": 192.53125, + "completions/mean_terminated_length": 192.53125, + "completions/min_length": 44.75, + "completions/min_terminated_length": 44.75, + "epoch": 0.4565, + "grad_norm": 3.6697700023651123, + "kl": 19.65625, + 
"learning_rate": 1.3222656952305113e-05, + "loss": 1.7577, + "num_tokens": 29085283.0, + "reward": 1.58984375, + "reward_std": 0.7727802395820618, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42867646366357803, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.37365298718214035, + "step": 913, + "token_counts/after_target": 484.0, + "token_counts/after_think": 66.75, + "token_counts/before_target": 1512.0, + "token_counts/before_think": 1017.75 + }, + { + "avg_penalty/after_target": 3.0588428378105164, + "avg_penalty/after_think": 3.8427807092666626, + "avg_penalty/before_target": 0.3095688968896866, + "avg_penalty/before_think": 0.6089174598455429, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.75, + "completions/max_terminated_length": 521.75, + "completions/mean_length": 221.28125, + "completions/mean_terminated_length": 221.28125, + "completions/min_length": 61.25, + "completions/min_terminated_length": 61.25, + "epoch": 0.457, + "grad_norm": 3.270066261291504, + "kl": 12.80078125, + "learning_rate": 1.3206129905856765e-05, + "loss": 1.2498, + "num_tokens": 29110997.0, + "reward": 1.67578125, + "reward_std": 0.5767718702554703, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.29886941611766815, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.28015752881765366, + "step": 914, + "token_counts/after_target": 574.5, + "token_counts/after_think": 112.25, + "token_counts/before_target": 1871.25, + "token_counts/before_think": 982.5 + }, + { + "avg_penalty/after_target": 2.611913502216339, + 
"avg_penalty/after_think": 3.52190500497818, + "avg_penalty/before_target": 0.36235949397087097, + "avg_penalty/before_think": 0.6082897298038006, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.25, + "completions/max_terminated_length": 529.25, + "completions/mean_length": 199.5625, + "completions/mean_terminated_length": 199.5625, + "completions/min_length": 35.5, + "completions/min_terminated_length": 35.5, + "epoch": 0.4575, + "grad_norm": 3.203482151031494, + "kl": 14.9375, + "learning_rate": 1.3189593092980701e-05, + "loss": 1.2312, + "num_tokens": 29133673.0, + "reward": 1.5234375, + "reward_std": 0.8143589049577713, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4519384130835533, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.3796655535697937, + "step": 915, + "token_counts/after_target": 592.75, + "token_counts/after_think": 56.0, + "token_counts/before_target": 1603.75, + "token_counts/before_think": 940.5 + }, + { + "avg_penalty/after_target": 2.5742907524108887, + "avg_penalty/after_think": 2.9037991166114807, + "avg_penalty/before_target": 0.24916193634271622, + "avg_penalty/before_think": 0.6018060706555843, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.5, + "completions/max_terminated_length": 529.5, + "completions/mean_length": 214.59375, + "completions/mean_terminated_length": 214.59375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.458, + "grad_norm": 3.275193929672241, + "kl": 12.125, + "learning_rate": 
1.3173046564050923e-05, + "loss": 1.0728, + "num_tokens": 29157487.0, + "reward": 1.5546875, + "reward_std": 0.7730326056480408, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.43399807065725327, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.35095974057912827, + "step": 916, + "token_counts/after_target": 426.75, + "token_counts/after_think": 205.0, + "token_counts/before_target": 1596.5, + "token_counts/before_think": 1205.25 + }, + { + "avg_penalty/after_target": 1.8592305779457092, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4043419286608696, + "avg_penalty/before_think": 0.636614203453064, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.5, + "completions/max_terminated_length": 625.5, + "completions/mean_length": 234.6875, + "completions/mean_terminated_length": 234.6875, + "completions/min_length": 33.5, + "completions/min_terminated_length": 33.5, + "epoch": 0.4585, + "grad_norm": 2.987848997116089, + "kl": 13.625, + "learning_rate": 1.3156490369471026e-05, + "loss": 1.2397, + "num_tokens": 29183787.0, + "reward": 1.64453125, + "reward_std": 0.7367480993270874, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.39123913645744324, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.34642939269542694, + "step": 917, + "token_counts/after_target": 623.75, + "token_counts/after_think": 113.75, + "token_counts/before_target": 1711.0, + "token_counts/before_think": 1306.5 + }, + { + "avg_penalty/after_target": 2.544963240623474, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3804715871810913, 
+ "avg_penalty/before_think": 0.5632360503077507, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.75, + "completions/max_terminated_length": 631.75, + "completions/mean_length": 269.0, + "completions/mean_terminated_length": 269.0, + "completions/min_length": 46.5, + "completions/min_terminated_length": 46.5, + "epoch": 0.459, + "grad_norm": 18.52060890197754, + "kl": 16.890625, + "learning_rate": 1.313992455967405e-05, + "loss": 1.2841, + "num_tokens": 29215579.0, + "reward": 1.453125, + "reward_std": 0.846356987953186, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.45028156042099, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.40667247027158737, + "step": 918, + "token_counts/after_target": 791.0, + "token_counts/after_think": 53.0, + "token_counts/before_target": 1867.75, + "token_counts/before_think": 1592.25 + }, + { + "avg_penalty/after_target": 1.56993168592453, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.47457533329725266, + "avg_penalty/before_think": 0.6211685091257095, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.5, + "completions/max_terminated_length": 822.5, + "completions/mean_length": 299.203125, + "completions/mean_terminated_length": 299.203125, + "completions/min_length": 64.5, + "completions/min_terminated_length": 64.5, + "epoch": 0.4595, + "grad_norm": 4.240321636199951, + "kl": 17.484375, + "learning_rate": 1.3123349185122328e-05, + "loss": 1.395, + "num_tokens": 29242136.0, + "reward": 1.56640625, + "reward_std": 0.7902878522872925, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.44721361994743347, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.3646910637617111, + "step": 919, + "token_counts/after_target": 868.25, + "token_counts/after_think": 249.25, + "token_counts/before_target": 2168.75, + "token_counts/before_think": 1501.0 + }, + { + "avg_penalty/after_target": 2.710691213607788, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.4330180734395981, + "avg_penalty/before_think": 0.5856007263064384, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.5, + "completions/max_terminated_length": 761.5, + "completions/mean_length": 310.53125, + "completions/mean_terminated_length": 310.53125, + "completions/min_length": 57.25, + "completions/min_terminated_length": 57.25, + "epoch": 0.46, + "grad_norm": 2.890228748321533, + "kl": 16.515625, + "learning_rate": 1.310676429630732e-05, + "loss": 1.4602, + "num_tokens": 29271434.0, + "reward": 1.4609375, + "reward_std": 0.836342990398407, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4546433389186859, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.3956945091485977, + "step": 920, + "token_counts/after_target": 1120.25, + "token_counts/after_think": 112.25, + "token_counts/before_target": 2074.0, + "token_counts/before_think": 1662.0 + }, + { + "avg_penalty/after_target": 2.455937922000885, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5042269341647625, + "avg_penalty/before_think": 0.624396376311779, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.5, + "completions/max_terminated_length": 646.5, + "completions/mean_length": 262.640625, + "completions/mean_terminated_length": 262.640625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.4605, + "grad_norm": 2.5661027431488037, + "kl": 13.58203125, + "learning_rate": 1.3090169943749475e-05, + "loss": 1.1753, + "num_tokens": 29301635.0, + "reward": 1.62890625, + "reward_std": 0.7144286334514618, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4022643193602562, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.3192904256284237, + "step": 921, + "token_counts/after_target": 983.5, + "token_counts/after_think": 176.75, + "token_counts/before_target": 1815.0, + "token_counts/before_think": 1227.0 + }, + { + "avg_penalty/after_target": 2.194880396127701, + "avg_penalty/after_think": 2.8660274147987366, + "avg_penalty/before_target": 0.4588818848133087, + "avg_penalty/before_think": 0.5884121507406235, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.25, + "completions/max_terminated_length": 592.25, + "completions/mean_length": 274.5625, + "completions/mean_terminated_length": 274.5625, + "completions/min_length": 73.25, + "completions/min_terminated_length": 73.25, + "epoch": 0.461, + "grad_norm": 8.367683410644531, + "kl": 15.6953125, + "learning_rate": 1.3073566177998073e-05, + "loss": 1.2915, + "num_tokens": 29329655.0, + "reward": 1.5390625, + "reward_std": 0.89283487200737, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 
0.10077822208404541, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.4119941517710686, + "step": 922, + "token_counts/after_target": 969.25, + "token_counts/after_think": 42.0, + "token_counts/before_target": 1942.5, + "token_counts/before_think": 1439.25 + }, + { + "avg_penalty/after_target": 2.3648266196250916, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.5285761654376984, + "avg_penalty/before_think": 0.7787685543298721, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 852.75, + "completions/max_terminated_length": 834.75, + "completions/mean_length": 428.59375, + "completions/mean_terminated_length": 419.8572998046875, + "completions/min_length": 133.25, + "completions/min_terminated_length": 133.25, + "epoch": 0.4615, + "grad_norm": 2.454967975616455, + "kl": 16.78125, + "learning_rate": 1.3056953049631059e-05, + "loss": 1.5233, + "num_tokens": 29367277.0, + "reward": 1.4765625, + "reward_std": 0.8367017954587936, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.39394544064998627, + "step": 923, + "token_counts/after_target": 1809.25, + "token_counts/after_think": 72.5, + "token_counts/before_target": 2930.0, + "token_counts/before_think": 2045.75 + }, + { + "avg_penalty/after_target": 2.072396367788315, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.41064032167196274, + "avg_penalty/before_think": 0.7309859395027161, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.25, + "completions/max_terminated_length": 601.25, + "completions/mean_length": 286.8125, + "completions/mean_terminated_length": 286.8125, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.462, + "grad_norm": 2.7554354667663574, + "kl": 18.640625, + "learning_rate": 1.3040330609254903e-05, + "loss": 1.5953, + "num_tokens": 29394609.0, + "reward": 1.49609375, + "reward_std": 0.8222716003656387, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4462348371744156, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.3857652619481087, + "step": 924, + "token_counts/after_target": 1000.25, + "token_counts/after_think": 149.5, + "token_counts/before_target": 1949.5, + "token_counts/before_think": 1489.75 + }, + { + "avg_penalty/after_target": 2.2679565846920013, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.49167725443840027, + "avg_penalty/before_think": 0.5319945216178894, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 761.75, + "completions/max_terminated_length": 682.5, + "completions/mean_length": 310.609375, + "completions/mean_terminated_length": 300.87084197998047, + "completions/min_length": 73.5, + "completions/min_terminated_length": 73.5, + "epoch": 0.4625, + "grad_norm": 2.7653112411499023, + "kl": 18.3125, + "learning_rate": 1.3023698907504447e-05, + "loss": 1.5922, + "num_tokens": 29423656.0, + "reward": 1.578125, + "reward_std": 0.8044310957193375, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + 
"rewards/format_reward/std": 0.42516325414180756, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.38658592849969864, + "step": 925, + "token_counts/after_target": 914.75, + "token_counts/after_think": 138.5, + "token_counts/before_target": 2333.25, + "token_counts/before_think": 1583.25 + }, + { + "avg_penalty/after_target": 2.3242334723472595, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4340265840291977, + "avg_penalty/before_think": 0.7666516155004501, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.5, + "completions/max_terminated_length": 764.5, + "completions/mean_length": 371.6875, + "completions/mean_terminated_length": 371.6875, + "completions/min_length": 100.5, + "completions/min_terminated_length": 100.5, + "epoch": 0.463, + "grad_norm": 2.865199565887451, + "kl": 11.625, + "learning_rate": 1.300705799504273e-05, + "loss": 1.1142, + "num_tokens": 29457444.0, + "reward": 1.5390625, + "reward_std": 0.799898624420166, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.41503459960222244, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.39269766956567764, + "step": 926, + "token_counts/after_target": 1287.25, + "token_counts/after_think": 186.5, + "token_counts/before_target": 2502.75, + "token_counts/before_think": 1970.5 + }, + { + "avg_penalty/after_target": 2.5226216912269592, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4126950204372406, + "avg_penalty/before_think": 0.7140631526708603, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + 
"completions/max_length": 596.25, + "completions/max_terminated_length": 596.25, + "completions/mean_length": 298.78125, + "completions/mean_terminated_length": 298.78125, + "completions/min_length": 52.75, + "completions/min_terminated_length": 52.75, + "epoch": 0.4635, + "grad_norm": 14.906435012817383, + "kl": 25.578125, + "learning_rate": 1.2990407922560869e-05, + "loss": 1.7253, + "num_tokens": 29484214.0, + "reward": 1.1171875, + "reward_std": 0.9231482893228531, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.53125, + "rewards/format_reward/std": 0.49345622956752777, + "rewards/tag_count_reward/mean": 0.5859375, + "rewards/tag_count_reward/std": 0.4408072605729103, + "step": 927, + "token_counts/after_target": 1203.0, + "token_counts/after_think": 51.25, + "token_counts/before_target": 2639.0, + "token_counts/before_think": 887.25 + }, + { + "avg_penalty/after_target": 1.9196370542049408, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.46311333775520325, + "avg_penalty/before_think": 0.8686888366937637, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 818.75, + "completions/max_terminated_length": 817.5, + "completions/mean_length": 374.546875, + "completions/mean_terminated_length": 366.0073013305664, + "completions/min_length": 63.75, + "completions/min_terminated_length": 63.75, + "epoch": 0.464, + "grad_norm": 5.193065166473389, + "kl": 21.046875, + "learning_rate": 1.297374874077786e-05, + "loss": 1.7263, + "num_tokens": 29519369.0, + "reward": 1.296875, + "reward_std": 0.895855188369751, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4876555874943733, + "rewards/tag_count_reward/mean": 0.6875, + 
"rewards/tag_count_reward/std": 0.4324247017502785, + "step": 928, + "token_counts/after_target": 1482.5, + "token_counts/after_think": 114.25, + "token_counts/before_target": 3082.25, + "token_counts/before_think": 1313.75 + }, + { + "avg_penalty/after_target": 1.891130805015564, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5404434651136398, + "avg_penalty/before_think": 0.6810438930988312, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 716.75, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 364.15625, + "completions/mean_terminated_length": 353.5093765258789, + "completions/min_length": 86.5, + "completions/min_terminated_length": 86.5, + "epoch": 0.4645, + "grad_norm": 6.494959831237793, + "kl": 21.109375, + "learning_rate": 1.2957080500440469e-05, + "loss": 1.6251, + "num_tokens": 29550755.0, + "reward": 1.15625, + "reward_std": 0.9355426728725433, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.53125, + "rewards/format_reward/std": 0.50393907725811, + "rewards/tag_count_reward/mean": 0.625, + "rewards/tag_count_reward/std": 0.4635772630572319, + "step": 929, + "token_counts/after_target": 1289.5, + "token_counts/after_think": 241.5, + "token_counts/before_target": 2764.0, + "token_counts/before_think": 1531.5 + }, + { + "avg_penalty/after_target": 2.1911709010601044, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.41578108072280884, + "avg_penalty/before_think": 0.68716199696064, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.5, + "completions/max_terminated_length": 739.5, + 
"completions/mean_length": 339.75, + "completions/mean_terminated_length": 339.75, + "completions/min_length": 31.25, + "completions/min_terminated_length": 31.25, + "epoch": 0.465, + "grad_norm": 6.0276384353637695, + "kl": 10.390625, + "learning_rate": 1.294040325232304e-05, + "loss": 1.0991, + "num_tokens": 29580547.0, + "reward": 1.37109375, + "reward_std": 0.8499381691217422, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4622559919953346, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.40502065420150757, + "step": 930, + "token_counts/after_target": 1118.0, + "token_counts/after_think": 349.0, + "token_counts/before_target": 2021.5, + "token_counts/before_think": 1947.5 + }, + { + "avg_penalty/after_target": 2.6378543972969055, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.4830818772315979, + "avg_penalty/before_think": 0.7407204657793045, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 845.5, + "completions/max_terminated_length": 787.25, + "completions/mean_length": 359.203125, + "completions/mean_terminated_length": 339.4718780517578, + "completions/min_length": 92.25, + "completions/min_terminated_length": 92.25, + "epoch": 0.4655, + "grad_norm": 3.1297566890716553, + "kl": 17.265625, + "learning_rate": 1.2923717047227368e-05, + "loss": 1.4896, + "num_tokens": 29611968.0, + "reward": 1.171875, + "reward_std": 0.966417133808136, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.5625, + "rewards/format_reward/std": 0.50393907725811, + "rewards/tag_count_reward/mean": 0.609375, + "rewards/tag_count_reward/std": 0.4743010550737381, + "step": 931, + 
"token_counts/after_target": 1527.5, + "token_counts/after_think": 118.0, + "token_counts/before_target": 2597.75, + "token_counts/before_think": 1504.0 + }, + { + "avg_penalty/after_target": 2.375340849161148, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.39576178044080734, + "avg_penalty/before_think": 0.6042030155658722, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 723.25, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 323.890625, + "completions/mean_terminated_length": 313.2291717529297, + "completions/min_length": 92.25, + "completions/min_terminated_length": 92.25, + "epoch": 0.466, + "grad_norm": 4.518115043640137, + "kl": 15.875, + "learning_rate": 1.2907021935982526e-05, + "loss": 1.4017, + "num_tokens": 29640441.0, + "reward": 1.1953125, + "reward_std": 0.9107872098684311, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.498777836561203, + "rewards/tag_count_reward/mean": 0.6484375, + "rewards/tag_count_reward/std": 0.4489165470004082, + "step": 932, + "token_counts/after_target": 1023.5, + "token_counts/after_think": 154.5, + "token_counts/before_target": 2304.75, + "token_counts/before_think": 1699.5 + }, + { + "avg_penalty/after_target": 2.3021220564842224, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3792773634195328, + "avg_penalty/before_think": 0.5682803764939308, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.75, + "completions/max_terminated_length": 677.75, + "completions/mean_length": 328.875, + 
"completions/mean_terminated_length": 328.875, + "completions/min_length": 89.25, + "completions/min_terminated_length": 89.25, + "epoch": 0.4665, + "grad_norm": 5.653041839599609, + "kl": 12.203125, + "learning_rate": 1.2890317969444716e-05, + "loss": 0.9038, + "num_tokens": 29670465.0, + "reward": 1.19921875, + "reward_std": 0.8903580904006958, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.4682852029800415, + "rewards/tag_count_reward/mean": 0.62109375, + "rewards/tag_count_reward/std": 0.4429647773504257, + "step": 933, + "token_counts/after_target": 942.5, + "token_counts/after_think": 131.0, + "token_counts/before_target": 2374.25, + "token_counts/before_think": 1814.25 + }, + { + "avg_penalty/after_target": 2.1793966591358185, + "avg_penalty/after_think": 2.995570123195648, + "avg_penalty/before_target": 0.472061850130558, + "avg_penalty/before_think": 0.6771627813577652, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 807.0, + "completions/max_terminated_length": 807.0, + "completions/mean_length": 369.65625, + "completions/mean_terminated_length": 369.65625, + "completions/min_length": 52.75, + "completions/min_terminated_length": 52.75, + "epoch": 0.467, + "grad_norm": 3.98639178276062, + "kl": 14.75, + "learning_rate": 1.2873605198497123e-05, + "loss": 1.3449, + "num_tokens": 29705227.0, + "reward": 1.1640625, + "reward_std": 0.9188015460968018, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.53125, + "rewards/format_reward/std": 0.5102732330560684, + "rewards/tag_count_reward/mean": 0.6328125, + "rewards/tag_count_reward/std": 0.43992631137371063, + "step": 934, + "token_counts/after_target": 1336.0, + 
"token_counts/after_think": 210.5, + "token_counts/before_target": 2594.75, + "token_counts/before_think": 1773.25 + }, + { + "avg_penalty/after_target": 2.068385988473892, + "avg_penalty/after_think": 3.948889911174774, + "avg_penalty/before_target": 0.5111168771982193, + "avg_penalty/before_think": 0.7583637833595276, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 766.0, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 371.375, + "completions/mean_terminated_length": 371.375, + "completions/min_length": 53.5, + "completions/min_terminated_length": 53.5, + "epoch": 0.4675, + "grad_norm": 6.517355442047119, + "kl": 12.28125, + "learning_rate": 1.2856883674049736e-05, + "loss": 1.3021, + "num_tokens": 29736435.0, + "reward": 1.27734375, + "reward_std": 0.8774633854627609, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.47663040459156036, + "rewards/tag_count_reward/mean": 0.68359375, + "rewards/tag_count_reward/std": 0.4298105537891388, + "step": 935, + "token_counts/after_target": 1400.25, + "token_counts/after_think": 158.0, + "token_counts/before_target": 2731.5, + "token_counts/before_think": 1652.25 + }, + { + "avg_penalty/after_target": 2.3970832228660583, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.5434696674346924, + "avg_penalty/before_think": 0.4883228726685047, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.25, + "completions/max_terminated_length": 784.25, + "completions/mean_length": 378.015625, + "completions/mean_terminated_length": 378.015625, + 
"completions/min_length": 72.5, + "completions/min_terminated_length": 72.5, + "epoch": 0.468, + "grad_norm": 2.9336047172546387, + "kl": 13.84375, + "learning_rate": 1.284015344703923e-05, + "loss": 1.2532, + "num_tokens": 29771780.0, + "reward": 1.26953125, + "reward_std": 0.8985690623521805, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.5018647313117981, + "rewards/tag_count_reward/mean": 0.67578125, + "rewards/tag_count_reward/std": 0.4308156222105026, + "step": 936, + "token_counts/after_target": 1442.25, + "token_counts/after_think": 35.25, + "token_counts/before_target": 2483.0, + "token_counts/before_think": 2087.75 + }, + { + "avg_penalty/after_target": 2.4112367928028107, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.44811809062957764, + "avg_penalty/before_think": 0.6036261022090912, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.75, + "completions/max_terminated_length": 668.75, + "completions/mean_length": 329.71875, + "completions/mean_terminated_length": 329.71875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.4685, + "grad_norm": 3.751420021057129, + "kl": 17.0625, + "learning_rate": 1.2823414568428767e-05, + "loss": 1.5364, + "num_tokens": 29801906.0, + "reward": 1.2109375, + "reward_std": 0.9785890728235245, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.49654312431812286, + "rewards/tag_count_reward/mean": 0.6171875, + "rewards/tag_count_reward/std": 0.47202320396900177, + "step": 937, + "token_counts/after_target": 1201.0, + "token_counts/after_think": 195.75, + "token_counts/before_target": 
2690.75, + "token_counts/before_think": 1188.0 + }, + { + "avg_penalty/after_target": 2.602775752544403, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.32726743817329407, + "avg_penalty/before_think": 0.5396305732429028, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 295.09375, + "completions/mean_terminated_length": 295.09375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.469, + "grad_norm": 7.8985595703125, + "kl": 17.171875, + "learning_rate": 1.280666708920788e-05, + "loss": 1.2481, + "num_tokens": 29835112.0, + "reward": 1.11328125, + "reward_std": 0.9365658462047577, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.515625, + "rewards/format_reward/std": 0.4896806851029396, + "rewards/tag_count_reward/mean": 0.59765625, + "rewards/tag_count_reward/std": 0.48188164085149765, + "step": 938, + "token_counts/after_target": 720.75, + "token_counts/after_think": 252.5, + "token_counts/before_target": 2424.25, + "token_counts/before_think": 1324.0 + }, + { + "avg_penalty/after_target": 2.483809620141983, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.4786337539553642, + "avg_penalty/before_think": 0.4765492305159569, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 667.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 287.0625, + "completions/mean_terminated_length": 275.02917098999023, + "completions/min_length": 55.5, + "completions/min_terminated_length": 55.5, + "epoch": 
0.4695, + "grad_norm": 3.881647825241089, + "kl": 13.5, + "learning_rate": 1.2789911060392295e-05, + "loss": 1.2342, + "num_tokens": 29862236.0, + "reward": 1.3515625, + "reward_std": 0.9298542737960815, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.49467839300632477, + "rewards/tag_count_reward/mean": 0.6953125, + "rewards/tag_count_reward/std": 0.44841913133859634, + "step": 939, + "token_counts/after_target": 1062.75, + "token_counts/after_think": 16.25, + "token_counts/before_target": 1816.5, + "token_counts/before_think": 1697.5 + }, + { + "avg_penalty/after_target": 2.1421916782855988, + "avg_penalty/after_think": 2.9650646448135376, + "avg_penalty/before_target": 0.3323100619018078, + "avg_penalty/before_think": 0.5166141241788864, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 677.75, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 254.65625, + "completions/mean_terminated_length": 242.94895935058594, + "completions/min_length": 55.75, + "completions/min_terminated_length": 55.75, + "epoch": 0.47, + "grad_norm": 3.0600452423095703, + "kl": 14.875, + "learning_rate": 1.2773146533023782e-05, + "loss": 1.3278, + "num_tokens": 29890422.0, + "reward": 1.42578125, + "reward_std": 0.8592235594987869, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4613594636321068, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.4159100875258446, + "step": 940, + "token_counts/after_target": 704.5, + "token_counts/after_think": 193.0, + "token_counts/before_target": 1894.0, + "token_counts/before_think": 1283.0 + }, + { + 
"avg_penalty/after_target": 2.951343059539795, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3335885778069496, + "avg_penalty/before_think": 0.4308439567685127, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 229.5625, + "completions/mean_terminated_length": 229.5625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.4705, + "grad_norm": 5.9727091789245605, + "kl": 20.8359375, + "learning_rate": 1.2756373558169992e-05, + "loss": 1.6847, + "num_tokens": 29912538.0, + "reward": 1.3671875, + "reward_std": 0.8597365617752075, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4819520115852356, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.4053691551089287, + "step": 941, + "token_counts/after_target": 725.0, + "token_counts/after_think": 16.0, + "token_counts/before_target": 1969.0, + "token_counts/before_think": 963.0 + }, + { + "avg_penalty/after_target": 2.5319738686084747, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.2709188833832741, + "avg_penalty/before_think": 0.5102051869034767, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.75, + "completions/max_terminated_length": 513.75, + "completions/mean_length": 215.765625, + "completions/mean_terminated_length": 215.765625, + "completions/min_length": 32.5, + "completions/min_terminated_length": 32.5, + "epoch": 0.471, + "grad_norm": 14.352804183959961, + "kl": 20.21875, + 
"learning_rate": 1.2739592186924327e-05, + "loss": 1.2587, + "num_tokens": 29937883.0, + "reward": 1.2890625, + "reward_std": 0.9463854283094406, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.4955305755138397, + "rewards/tag_count_reward/mean": 0.6640625, + "rewards/tag_count_reward/std": 0.4584831967949867, + "step": 942, + "token_counts/after_target": 441.0, + "token_counts/after_think": 40.0, + "token_counts/before_target": 1739.25, + "token_counts/before_think": 1232.0 + }, + { + "avg_penalty/after_target": 2.295345216989517, + "avg_penalty/after_think": 2.7582860589027405, + "avg_penalty/before_target": 0.36775587499141693, + "avg_penalty/before_think": 0.3900612071156502, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 211.3125, + "completions/mean_terminated_length": 211.3125, + "completions/min_length": 41.25, + "completions/min_terminated_length": 41.25, + "epoch": 0.4715, + "grad_norm": 17.538219451904297, + "kl": 25.5, + "learning_rate": 1.2722802470405744e-05, + "loss": 1.547, + "num_tokens": 29960367.0, + "reward": 1.1640625, + "reward_std": 1.0231481045484543, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.11180340498685837, + "rewards/format_reward/mean": 0.515625, + "rewards/format_reward/std": 0.5071863383054733, + "rewards/tag_count_reward/mean": 0.5859375, + "rewards/tag_count_reward/std": 0.46718520671129227, + "step": 943, + "token_counts/after_target": 442.75, + "token_counts/after_think": 38.0, + "token_counts/before_target": 2151.75, + "token_counts/before_think": 748.5 + }, + { + "avg_penalty/after_target": 2.1319454312324524, + "avg_penalty/after_think": 
2.9993419647216797, + "avg_penalty/before_target": 0.3166241869330406, + "avg_penalty/before_think": 0.5336083620786667, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 178.78125, + "completions/mean_terminated_length": 178.78125, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.472, + "grad_norm": 3.580526351928711, + "kl": 21.4375, + "learning_rate": 1.2706004459758636e-05, + "loss": 1.6943, + "num_tokens": 29982049.0, + "reward": 1.4453125, + "reward_std": 0.8494500517845154, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4704566150903702, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.3927219808101654, + "step": 944, + "token_counts/after_target": 390.0, + "token_counts/after_think": 93.5, + "token_counts/before_target": 1661.75, + "token_counts/before_think": 715.25 + }, + { + "avg_penalty/after_target": 2.491413176059723, + "avg_penalty/after_think": 2.8170096278190613, + "avg_penalty/before_target": 0.4372256435453892, + "avg_penalty/before_think": 0.6205784901976585, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 714.5, + "completions/max_terminated_length": 560.5, + "completions/mean_length": 221.703125, + "completions/mean_terminated_length": 208.1750030517578, + "completions/min_length": 36.25, + "completions/min_terminated_length": 36.25, + "epoch": 0.4725, + "grad_norm": 20.677417755126953, + "kl": 23.96875, + "learning_rate": 1.2689198206152657e-05, + 
"loss": 2.0062, + "num_tokens": 30008782.0, + "reward": 1.2578125, + "reward_std": 0.9721125662326813, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.4909028485417366, + "rewards/tag_count_reward/mean": 0.6328125, + "rewards/tag_count_reward/std": 0.4819914475083351, + "step": 945, + "token_counts/after_target": 805.0, + "token_counts/after_think": 84.75, + "token_counts/before_target": 2005.25, + "token_counts/before_think": 652.25 + }, + { + "avg_penalty/after_target": 2.1651197969913483, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3401145078241825, + "avg_penalty/before_think": 0.5301640331745148, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.5, + "completions/max_terminated_length": 478.5, + "completions/mean_length": 198.921875, + "completions/mean_terminated_length": 198.921875, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.473, + "grad_norm": 4.487083435058594, + "kl": 14.328125, + "learning_rate": 1.267238376078257e-05, + "loss": 1.3507, + "num_tokens": 30032281.0, + "reward": 1.60546875, + "reward_std": 0.7913870513439178, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.38578568398952484, + "step": 946, + "token_counts/after_target": 528.5, + "token_counts/after_think": 63.75, + "token_counts/before_target": 1753.5, + "token_counts/before_think": 837.0 + }, + { + "avg_penalty/after_target": 2.060414582490921, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4952646493911743, + 
"avg_penalty/before_think": 0.4321221485733986, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.75, + "completions/max_terminated_length": 594.75, + "completions/mean_length": 218.765625, + "completions/mean_terminated_length": 218.765625, + "completions/min_length": 61.75, + "completions/min_terminated_length": 61.75, + "epoch": 0.4735, + "grad_norm": 8.644635200500488, + "kl": 18.109375, + "learning_rate": 1.265556117486809e-05, + "loss": 1.7445, + "num_tokens": 30057498.0, + "reward": 1.47265625, + "reward_std": 0.8633173257112503, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.41186822950839996, + "step": 947, + "token_counts/after_target": 643.5, + "token_counts/after_think": 75.5, + "token_counts/before_target": 1624.0, + "token_counts/before_think": 1157.25 + }, + { + "avg_penalty/after_target": 3.015868455171585, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.25584251806139946, + "avg_penalty/before_think": 0.43273765593767166, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 179.6875, + "completions/mean_terminated_length": 179.6875, + "completions/min_length": 55.5, + "completions/min_terminated_length": 55.5, + "epoch": 0.474, + "grad_norm": 11.010610580444336, + "kl": 17.0, + "learning_rate": 1.2638730499653731e-05, + "loss": 1.753, + "num_tokens": 30078310.0, + "reward": 1.62890625, + "reward_std": 
0.7895528823137283, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.3912878558039665, + "step": 948, + "token_counts/after_target": 462.5, + "token_counts/after_think": 39.25, + "token_counts/before_target": 1488.0, + "token_counts/before_think": 885.25 + }, + { + "avg_penalty/after_target": 2.9065463542938232, + "avg_penalty/after_think": 1.8507823944091797, + "avg_penalty/before_target": 0.43678903207182884, + "avg_penalty/before_think": 0.39186811819672585, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 790.5, + "completions/max_terminated_length": 553.25, + "completions/mean_length": 230.828125, + "completions/mean_terminated_length": 204.78438186645508, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.4745, + "grad_norm": 12.404436111450195, + "kl": 25.578125, + "learning_rate": 1.2621891786408648e-05, + "loss": 2.4477, + "num_tokens": 30104587.0, + "reward": 1.375, + "reward_std": 0.9103951007127762, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.4757782220840454, + "rewards/tag_count_reward/mean": 0.703125, + "rewards/tag_count_reward/std": 0.4423694387078285, + "step": 949, + "token_counts/after_target": 1247.75, + "token_counts/after_think": 85.75, + "token_counts/before_target": 1660.75, + "token_counts/before_think": 699.0 + }, + { + "avg_penalty/after_target": 2.5599484741687775, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3683093413710594, + "avg_penalty/before_think": 0.45076828449964523, + "clip_ratio/high_max": 
0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.25, + "completions/max_terminated_length": 666.25, + "completions/mean_length": 220.21875, + "completions/mean_terminated_length": 220.21875, + "completions/min_length": 46.75, + "completions/min_terminated_length": 46.75, + "epoch": 0.475, + "grad_norm": 9.444982528686523, + "kl": 30.875, + "learning_rate": 1.2605045086426487e-05, + "loss": 2.3531, + "num_tokens": 30127273.0, + "reward": 1.32421875, + "reward_std": 0.9264971166849136, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4819520115852356, + "rewards/tag_count_reward/mean": 0.66796875, + "rewards/tag_count_reward/std": 0.45717287063598633, + "step": 950, + "token_counts/after_target": 719.5, + "token_counts/after_think": 30.0, + "token_counts/before_target": 2277.75, + "token_counts/before_think": 496.25 + }, + { + "avg_penalty/after_target": 2.047226518392563, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.36355050653219223, + "avg_penalty/before_think": 0.3696665018796921, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 656.5, + "completions/max_terminated_length": 498.5, + "completions/mean_length": 170.171875, + "completions/mean_terminated_length": 155.94375228881836, + "completions/min_length": 36.25, + "completions/min_terminated_length": 36.25, + "epoch": 0.4755, + "grad_norm": 7.169468879699707, + "kl": 24.71875, + "learning_rate": 1.2588190451025209e-05, + "loss": 1.8388, + "num_tokens": 30146388.0, + "reward": 1.453125, + "reward_std": 0.8955731093883514, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4604102149605751, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.43620988726615906, + "step": 951, + "token_counts/after_target": 420.75, + "token_counts/after_think": 31.5, + "token_counts/before_target": 1514.75, + "token_counts/before_think": 755.75 + }, + { + "avg_penalty/after_target": 2.5561118125915527, + "avg_penalty/after_think": 2.810328483581543, + "avg_penalty/before_target": 0.28195005282759666, + "avg_penalty/before_think": 0.3785044848918915, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.5, + "completions/max_terminated_length": 414.5, + "completions/mean_length": 171.734375, + "completions/mean_terminated_length": 171.734375, + "completions/min_length": 54.75, + "completions/min_terminated_length": 54.75, + "epoch": 0.476, + "grad_norm": 2.8328733444213867, + "kl": 18.828125, + "learning_rate": 1.2571327931546964e-05, + "loss": 1.5707, + "num_tokens": 30168483.0, + "reward": 1.5625, + "reward_std": 0.8238613456487656, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4361884370446205, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.3964197337627411, + "step": 952, + "token_counts/after_target": 311.5, + "token_counts/after_think": 120.5, + "token_counts/before_target": 1579.5, + "token_counts/before_think": 736.25 + }, + { + "avg_penalty/after_target": 2.646948963403702, + "avg_penalty/after_think": 2.968027174472809, + "avg_penalty/before_target": 0.3564397096633911, + "avg_penalty/before_think": 0.42098578810691833, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 
0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 607.75, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 186.671875, + "completions/mean_terminated_length": 173.99792098999023, + "completions/min_length": 59.25, + "completions/min_terminated_length": 59.25, + "epoch": 0.4765, + "grad_norm": 3.307917594909668, + "kl": 21.234375, + "learning_rate": 1.2554457579357906e-05, + "loss": 1.862, + "num_tokens": 30191406.0, + "reward": 1.546875, + "reward_std": 0.771123081445694, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.404181070625782, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.3599751591682434, + "step": 953, + "token_counts/after_target": 455.0, + "token_counts/after_think": 35.25, + "token_counts/before_target": 1637.0, + "token_counts/before_think": 859.5 + }, + { + "avg_penalty/after_target": 2.368818074464798, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3995356149971485, + "avg_penalty/before_think": 0.45980483293533325, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.5, + "completions/max_terminated_length": 568.5, + "completions/mean_length": 172.09375, + "completions/mean_terminated_length": 172.09375, + "completions/min_length": 44.5, + "completions/min_terminated_length": 44.5, + "epoch": 0.477, + "grad_norm": 6.795891761779785, + "kl": 18.0625, + "learning_rate": 1.2537579445848058e-05, + "loss": 1.6335, + "num_tokens": 30215316.0, + "reward": 1.59375, + "reward_std": 0.7723339647054672, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 
0.765625, + "rewards/format_reward/std": 0.41503459960222244, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.353743739426136, + "step": 954, + "token_counts/after_target": 541.75, + "token_counts/after_think": 20.0, + "token_counts/before_target": 1322.25, + "token_counts/before_think": 869.5 + }, + { + "avg_penalty/after_target": 2.969746947288513, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.2794124558568001, + "avg_penalty/before_think": 0.5820942893624306, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.5, + "completions/max_terminated_length": 354.5, + "completions/mean_length": 168.96875, + "completions/mean_terminated_length": 168.96875, + "completions/min_length": 32.25, + "completions/min_terminated_length": 32.25, + "epoch": 0.4775, + "grad_norm": 3.4034459590911865, + "kl": 16.875, + "learning_rate": 1.252069358243114e-05, + "loss": 1.443, + "num_tokens": 30235522.0, + "reward": 1.5703125, + "reward_std": 0.7839806526899338, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42867646366357803, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3607623912394047, + "step": 955, + "token_counts/after_target": 296.0, + "token_counts/after_think": 115.0, + "token_counts/before_target": 1471.75, + "token_counts/before_think": 820.75 + }, + { + "avg_penalty/after_target": 3.5555700063705444, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4144027754664421, + "avg_penalty/before_think": 0.30264293774962425, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 
0.0, + "completions/max_length": 582.25, + "completions/max_terminated_length": 582.25, + "completions/mean_length": 152.4375, + "completions/mean_terminated_length": 152.4375, + "completions/min_length": 43.25, + "completions/min_terminated_length": 43.25, + "epoch": 0.478, + "grad_norm": 7.707114219665527, + "kl": 22.90625, + "learning_rate": 1.2503800040544417e-05, + "loss": 2.1655, + "num_tokens": 30256878.0, + "reward": 1.5859375, + "reward_std": 0.6694766581058502, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.38879410922527313, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.2900998964905739, + "step": 956, + "token_counts/after_target": 451.5, + "token_counts/after_think": 36.25, + "token_counts/before_target": 1326.25, + "token_counts/before_think": 625.0 + }, + { + "avg_penalty/after_target": 3.488666534423828, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.2715624161064625, + "avg_penalty/before_think": 0.37926260381937027, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.75, + "completions/max_terminated_length": 346.75, + "completions/mean_length": 132.28125, + "completions/mean_terminated_length": 132.28125, + "completions/min_length": 35.75, + "completions/min_terminated_length": 35.75, + "epoch": 0.4785, + "grad_norm": 4.407375335693359, + "kl": 7.080078125, + "learning_rate": 1.2486898871648552e-05, + "loss": 0.8552, + "num_tokens": 30275056.0, + "reward": 1.77734375, + "reward_std": 0.5911993682384491, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.2750816270709038, + "rewards/tag_count_reward/mean": 0.88671875, + 
"rewards/tag_count_reward/std": 0.2562625780701637, + "step": 957, + "token_counts/after_target": 271.0, + "token_counts/after_think": 16.25, + "token_counts/before_target": 790.0, + "token_counts/before_think": 1039.25 + }, + { + "avg_penalty/after_target": 2.089619994163513, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.2986432984471321, + "avg_penalty/before_think": 0.5548093467950821, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.25, + "completions/max_terminated_length": 328.25, + "completions/mean_length": 119.375, + "completions/mean_terminated_length": 119.375, + "completions/min_length": 28.5, + "completions/min_terminated_length": 28.5, + "epoch": 0.479, + "grad_norm": 2.723158121109009, + "kl": 11.2421875, + "learning_rate": 1.2469990127227432e-05, + "loss": 0.9898, + "num_tokens": 30292232.0, + "reward": 1.73046875, + "reward_std": 0.6640192717313766, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3450859263539314, + "rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.32162274420261383, + "step": 958, + "token_counts/after_target": 191.75, + "token_counts/after_think": 26.75, + "token_counts/before_target": 1060.75, + "token_counts/before_think": 630.75 + }, + { + "avg_penalty/after_target": 2.923337399959564, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.265487864613533, + "avg_penalty/before_think": 0.5700800828635693, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + 
"completions/mean_length": 174.46875, + "completions/mean_terminated_length": 174.46875, + "completions/min_length": 45.75, + "completions/min_terminated_length": 45.75, + "epoch": 0.4795, + "grad_norm": 5.477448463439941, + "kl": 6.82421875, + "learning_rate": 1.2453073858788027e-05, + "loss": 0.9577, + "num_tokens": 30315606.0, + "reward": 1.8046875, + "reward_std": 0.491535946726799, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.27156074345111847, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.22775010764598846, + "step": 959, + "token_counts/after_target": 409.5, + "token_counts/after_think": 71.0, + "token_counts/before_target": 1137.75, + "token_counts/before_think": 1173.25 + }, + { + "avg_penalty/after_target": 2.0449873507022858, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5775269716978073, + "avg_penalty/before_think": 0.4518299922347069, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.5, + "completions/max_terminated_length": 548.5, + "completions/mean_length": 201.203125, + "completions/mean_terminated_length": 201.203125, + "completions/min_length": 40.5, + "completions/min_terminated_length": 40.5, + "epoch": 0.48, + "grad_norm": 6.752363204956055, + "kl": 19.765625, + "learning_rate": 1.2436150117860226e-05, + "loss": 1.6041, + "num_tokens": 30341475.0, + "reward": 1.65234375, + "reward_std": 0.7085781693458557, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4141380712389946, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.2993130572140217, + "step": 960, + 
"token_counts/after_target": 655.5, + "token_counts/after_think": 21.25, + "token_counts/before_target": 1120.25, + "token_counts/before_think": 1422.25 + }, + { + "avg_penalty/after_target": 2.6027392745018005, + "avg_penalty/after_think": 3.233057975769043, + "avg_penalty/before_target": 0.29747138172388077, + "avg_penalty/before_think": 0.46016473323106766, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 168.0625, + "completions/mean_terminated_length": 168.0625, + "completions/min_length": 51.75, + "completions/min_terminated_length": 51.75, + "epoch": 0.4805, + "grad_norm": 4.103142261505127, + "kl": 11.29052734375, + "learning_rate": 1.2419218955996677e-05, + "loss": 1.2669, + "num_tokens": 30362087.0, + "reward": 1.8984375, + "reward_std": 0.2711019217967987, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.1632782220840454, + "rewards/tag_count_reward/mean": 0.9609375, + "rewards/tag_count_reward/std": 0.11129852384328842, + "step": 961, + "token_counts/after_target": 310.5, + "token_counts/after_think": 77.5, + "token_counts/before_target": 1262.0, + "token_counts/before_think": 1039.0 + }, + { + "avg_penalty/after_target": 3.2007618248462677, + "avg_penalty/after_think": 3.920959711074829, + "avg_penalty/before_target": 0.338350024074316, + "avg_penalty/before_think": 0.5482895746827126, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.25, + "completions/max_terminated_length": 577.25, + "completions/mean_length": 178.953125, + 
"completions/mean_terminated_length": 178.953125, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.481, + "grad_norm": 13.183649063110352, + "kl": 25.7734375, + "learning_rate": 1.2402280424772639e-05, + "loss": 1.8818, + "num_tokens": 30387348.0, + "reward": 1.4765625, + "reward_std": 0.8244639486074448, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.46034691482782364, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.3763582818210125, + "step": 962, + "token_counts/after_target": 484.5, + "token_counts/after_think": 55.5, + "token_counts/before_target": 1543.0, + "token_counts/before_think": 780.25 + }, + { + "avg_penalty/after_target": 2.006738394498825, + "avg_penalty/after_think": 2.8778122067451477, + "avg_penalty/before_target": 0.47152237966656685, + "avg_penalty/before_think": 0.8696912154555321, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 648.0, + "completions/max_terminated_length": 607.75, + "completions/mean_length": 223.546875, + "completions/mean_terminated_length": 212.75104522705078, + "completions/min_length": 22.25, + "completions/min_terminated_length": 22.25, + "epoch": 0.4815, + "grad_norm": 24.130128860473633, + "kl": 21.046875, + "learning_rate": 1.238533457578581e-05, + "loss": 1.5014, + "num_tokens": 30418727.0, + "reward": 1.2734375, + "reward_std": 0.6640723496675491, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.35648179799318314, + "rewards/tag_count_reward/mean": 0.6640625, + "rewards/tag_count_reward/std": 0.3296074867248535, + "step": 963, + "token_counts/after_target": 973.0, 
+ "token_counts/after_think": 68.75, + "token_counts/before_target": 1691.5, + "token_counts/before_think": 843.5 + }, + { + "avg_penalty/after_target": 2.7134042978286743, + "avg_penalty/after_think": 3.845842123031616, + "avg_penalty/before_target": 0.28095515072345734, + "avg_penalty/before_think": 0.5380027741193771, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 685.25, + "completions/max_terminated_length": 522.75, + "completions/mean_length": 226.671875, + "completions/mean_terminated_length": 213.03541946411133, + "completions/min_length": 49.25, + "completions/min_terminated_length": 49.25, + "epoch": 0.482, + "grad_norm": 11.318937301635742, + "kl": 19.390625, + "learning_rate": 1.236838146065619e-05, + "loss": 1.4405, + "num_tokens": 30441650.0, + "reward": 1.6796875, + "reward_std": 0.6765464097261429, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.38688503205776215, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.3009353503584862, + "step": 964, + "token_counts/after_target": 448.25, + "token_counts/after_think": 143.25, + "token_counts/before_target": 1551.25, + "token_counts/before_think": 1484.0 + }, + { + "avg_penalty/after_target": 2.4454464316368103, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.5146458074450493, + "avg_penalty/before_think": 0.5485655292868614, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 852.0, + "completions/max_terminated_length": 767.25, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 
260.2312545776367, + "completions/min_length": 38.5, + "completions/min_terminated_length": 38.5, + "epoch": 0.4825, + "grad_norm": 17.08168601989746, + "kl": 30.46875, + "learning_rate": 1.23514211310259e-05, + "loss": 2.1668, + "num_tokens": 30472962.0, + "reward": 1.37890625, + "reward_std": 0.8591521382331848, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4876555874943733, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.3966013044118881, + "step": 965, + "token_counts/after_target": 884.25, + "token_counts/after_think": 16.75, + "token_counts/before_target": 1920.75, + "token_counts/before_think": 1514.25 + }, + { + "avg_penalty/after_target": 2.7597576379776, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.540242001414299, + "avg_penalty/before_think": 0.4662882052361965, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 898.5, + "completions/max_terminated_length": 834.25, + "completions/mean_length": 313.765625, + "completions/mean_terminated_length": 270.171875, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.483, + "grad_norm": 28.32604217529297, + "kl": 40.34375, + "learning_rate": 1.2334453638559057e-05, + "loss": 2.6977, + "num_tokens": 30504659.0, + "reward": 1.28515625, + "reward_std": 0.895174577832222, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.49297719448804855, + "rewards/tag_count_reward/mean": 0.69140625, + "rewards/tag_count_reward/std": 0.42818088084459305, + "step": 966, + "token_counts/after_target": 1579.25, + "token_counts/after_think": 4.75, + 
"token_counts/before_target": 2180.5, + "token_counts/before_think": 1255.75 + }, + { + "avg_penalty/after_target": 2.3117074370384216, + "avg_penalty/after_think": 2.805951476097107, + "avg_penalty/before_target": 0.5184226855635643, + "avg_penalty/before_think": 0.6052487418055534, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 809.75, + "completions/max_terminated_length": 809.75, + "completions/mean_length": 291.765625, + "completions/mean_terminated_length": 291.765625, + "completions/min_length": 32.75, + "completions/min_terminated_length": 32.75, + "epoch": 0.4835, + "grad_norm": 14.973099708557129, + "kl": 29.4375, + "learning_rate": 1.2317479034941572e-05, + "loss": 2.1153, + "num_tokens": 30532276.0, + "reward": 1.33203125, + "reward_std": 0.8907637000083923, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.48989029973745346, + "rewards/tag_count_reward/mean": 0.70703125, + "rewards/tag_count_reward/std": 0.4008079543709755, + "step": 967, + "token_counts/after_target": 999.75, + "token_counts/after_think": 48.75, + "token_counts/before_target": 1588.25, + "token_counts/before_think": 2031.5 + }, + { + "avg_penalty/after_target": 2.8025290966033936, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.528653934597969, + "avg_penalty/before_think": 0.5373520851135254, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 819.25, + "completions/max_terminated_length": 691.5, + "completions/mean_length": 329.6875, + "completions/mean_terminated_length": 294.88185119628906, + "completions/min_length": 
45.75, + "completions/min_terminated_length": 45.75, + "epoch": 0.484, + "grad_norm": 7.807101249694824, + "kl": 32.375, + "learning_rate": 1.2300497371881046e-05, + "loss": 2.55, + "num_tokens": 30561424.0, + "reward": 1.3203125, + "reward_std": 0.8789457827806473, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4939897432923317, + "rewards/tag_count_reward/mean": 0.7109375, + "rewards/tag_count_reward/std": 0.40868861228227615, + "step": 968, + "token_counts/after_target": 1599.0, + "token_counts/after_think": 72.5, + "token_counts/before_target": 2505.0, + "token_counts/before_think": 1098.5 + }, + { + "avg_penalty/after_target": 2.8402529060840607, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3953622579574585, + "avg_penalty/before_think": 0.5395996570587158, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 713.0, + "completions/max_terminated_length": 583.75, + "completions/mean_length": 275.0, + "completions/mean_terminated_length": 263.1197967529297, + "completions/min_length": 57.25, + "completions/min_terminated_length": 57.25, + "epoch": 0.4845, + "grad_norm": 7.106358051300049, + "kl": 21.75, + "learning_rate": 1.2283508701106559e-05, + "loss": 2.0215, + "num_tokens": 30588944.0, + "reward": 1.55859375, + "reward_std": 0.7518309354782104, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4414467439055443, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.33382977545261383, + "step": 969, + "token_counts/after_target": 876.0, + "token_counts/after_think": 131.25, + "token_counts/before_target": 1975.25, + 
"token_counts/before_think": 1417.5 + }, + { + "avg_penalty/after_target": 2.6121257841587067, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3731287196278572, + "avg_penalty/before_think": 0.637649804353714, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.5, + "completions/max_terminated_length": 578.5, + "completions/mean_length": 237.390625, + "completions/mean_terminated_length": 237.390625, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.485, + "grad_norm": 6.468034267425537, + "kl": 16.34375, + "learning_rate": 1.2266513074368552e-05, + "loss": 1.5629, + "num_tokens": 30615081.0, + "reward": 1.5078125, + "reward_std": 0.7978933900594711, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4625816270709038, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3696608543395996, + "step": 970, + "token_counts/after_target": 735.75, + "token_counts/after_think": 159.0, + "token_counts/before_target": 1686.75, + "token_counts/before_think": 1216.75 + }, + { + "avg_penalty/after_target": 2.4591354727745056, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3400331810116768, + "avg_penalty/before_think": 0.6516178473830223, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 783.0, + "completions/max_terminated_length": 653.75, + "completions/mean_length": 247.296875, + "completions/mean_terminated_length": 234.97291946411133, + "completions/min_length": 51.5, + "completions/min_terminated_length": 51.5, + "epoch": 0.4855, 
+ "grad_norm": 5.716432094573975, + "kl": 17.3125, + "learning_rate": 1.2249510543438652e-05, + "loss": 1.6786, + "num_tokens": 30642876.0, + "reward": 1.5703125, + "reward_std": 0.7801995277404785, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.35558905452489853, + "step": 971, + "token_counts/after_target": 576.5, + "token_counts/after_think": 291.0, + "token_counts/before_target": 1377.75, + "token_counts/before_think": 1711.5 + }, + { + "avg_penalty/after_target": 2.7905723452568054, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.40488024055957794, + "avg_penalty/before_think": 0.7033679932355881, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 922.0, + "completions/max_terminated_length": 879.5, + "completions/mean_length": 337.84375, + "completions/mean_terminated_length": 316.17188262939453, + "completions/min_length": 74.25, + "completions/min_terminated_length": 74.25, + "epoch": 0.486, + "grad_norm": 9.790815353393555, + "kl": 19.5, + "learning_rate": 1.2232501160109516e-05, + "loss": 1.8856, + "num_tokens": 30675490.0, + "reward": 1.5390625, + "reward_std": 0.7589712589979172, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.32644279673695564, + "step": 972, + "token_counts/after_target": 1169.75, + "token_counts/after_think": 206.0, + "token_counts/before_target": 1958.25, + "token_counts/before_think": 2071.5 + }, + { + "avg_penalty/after_target": 1.8547545671463013, 
+ "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5013121590018272, + "avg_penalty/before_think": 0.624756470322609, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 841.25, + "completions/max_terminated_length": 772.5, + "completions/mean_length": 288.21875, + "completions/mean_terminated_length": 277.4260482788086, + "completions/min_length": 43.75, + "completions/min_terminated_length": 43.75, + "epoch": 0.4865, + "grad_norm": 7.884925365447998, + "kl": 17.15625, + "learning_rate": 1.2215484976194675e-05, + "loss": 1.6787, + "num_tokens": 30705104.0, + "reward": 1.5234375, + "reward_std": 0.7498479038476944, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.45283494144678116, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.35589924454689026, + "step": 973, + "token_counts/after_target": 710.5, + "token_counts/after_think": 292.25, + "token_counts/before_target": 1800.25, + "token_counts/before_think": 1808.5 + }, + { + "avg_penalty/after_target": 2.597216099500656, + "avg_penalty/after_think": 2.501859128475189, + "avg_penalty/before_target": 0.5177290104329586, + "avg_penalty/before_think": 0.7027246057987213, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 810.75, + "completions/max_terminated_length": 795.75, + "completions/mean_length": 308.875, + "completions/mean_terminated_length": 298.02083587646484, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.487, + "grad_norm": 7.739646911621094, + "kl": 
21.1875, + "learning_rate": 1.2198462043528376e-05, + "loss": 2.0291, + "num_tokens": 30733160.0, + "reward": 1.578125, + "reward_std": 0.8829536736011505, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.11967839300632477, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4682852029800415, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.3746466040611267, + "step": 974, + "token_counts/after_target": 1180.5, + "token_counts/after_think": 229.0, + "token_counts/before_target": 1643.5, + "token_counts/before_think": 1889.0 + }, + { + "avg_penalty/after_target": 1.9848805367946625, + "avg_penalty/after_think": 2.5894248485565186, + "avg_penalty/before_target": 0.3462498188018799, + "avg_penalty/before_think": 0.6108062118291855, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 758.75, + "completions/max_terminated_length": 736.25, + "completions/mean_length": 260.34375, + "completions/mean_terminated_length": 249.04792022705078, + "completions/min_length": 72.25, + "completions/min_terminated_length": 72.25, + "epoch": 0.4875, + "grad_norm": 5.338080406188965, + "kl": 22.390625, + "learning_rate": 1.2181432413965428e-05, + "loss": 1.8093, + "num_tokens": 30758078.0, + "reward": 1.47265625, + "reward_std": 0.7580147981643677, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.4393647313117981, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.36337798088788986, + "step": 975, + "token_counts/after_target": 462.5, + "token_counts/after_think": 229.75, + "token_counts/before_target": 1936.5, + "token_counts/before_think": 1536.75 + }, + { + 
"avg_penalty/after_target": 1.5708581805229187, + "avg_penalty/after_think": 3.703922152519226, + "avg_penalty/before_target": 0.64799714833498, + "avg_penalty/before_think": 0.48663773387670517, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 917.0, + "completions/max_terminated_length": 815.75, + "completions/mean_length": 303.59375, + "completions/mean_terminated_length": 270.8842315673828, + "completions/min_length": 76.75, + "completions/min_terminated_length": 76.75, + "epoch": 0.488, + "grad_norm": 9.081774711608887, + "kl": 31.71875, + "learning_rate": 1.2164396139381029e-05, + "loss": 2.4882, + "num_tokens": 30790788.0, + "reward": 1.4765625, + "reward_std": 0.7996680289506912, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4761601909995079, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.36598214507102966, + "step": 976, + "token_counts/after_target": 1177.5, + "token_counts/after_think": 88.0, + "token_counts/before_target": 2337.5, + "token_counts/before_think": 1254.5 + }, + { + "avg_penalty/after_target": 3.3501288294792175, + "avg_penalty/after_think": 2.5301671028137207, + "avg_penalty/before_target": 0.29487861320376396, + "avg_penalty/before_think": 0.4662986397743225, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 604.25, + "completions/max_terminated_length": 525.5, + "completions/mean_length": 273.90625, + "completions/mean_terminated_length": 254.81919860839844, + "completions/min_length": 82.75, + "completions/min_terminated_length": 82.75, + "epoch": 0.4885, 
+ "grad_norm": 13.845564842224121, + "kl": 27.0, + "learning_rate": 1.2147353271670634e-05, + "loss": 1.9214, + "num_tokens": 30817294.0, + "reward": 1.42578125, + "reward_std": 0.8261671215295792, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.45247192680835724, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.3889564797282219, + "step": 977, + "token_counts/after_target": 769.5, + "token_counts/after_think": 33.75, + "token_counts/before_target": 2341.25, + "token_counts/before_think": 1238.0 + }, + { + "avg_penalty/after_target": 2.7405886054039, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3769070953130722, + "avg_penalty/before_think": 0.40625883638858795, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.75, + "completions/max_terminated_length": 554.75, + "completions/mean_length": 267.953125, + "completions/mean_terminated_length": 267.953125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.489, + "grad_norm": 5.093008041381836, + "kl": 19.984375, + "learning_rate": 1.2130303862749769e-05, + "loss": 1.6463, + "num_tokens": 30844091.0, + "reward": 1.52734375, + "reward_std": 0.7868666350841522, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44187305867671967, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.35828715562820435, + "step": 978, + "token_counts/after_target": 762.0, + "token_counts/after_think": 65.25, + "token_counts/before_target": 1938.75, + "token_counts/before_think": 1521.25 + }, + { + "avg_penalty/after_target": 2.194432318210602, + 
"avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5671889781951904, + "avg_penalty/before_think": 0.5256944075226784, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 788.5, + "completions/max_terminated_length": 772.25, + "completions/mean_length": 296.125, + "completions/mean_terminated_length": 287.2072982788086, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.4895, + "grad_norm": 4.4431233406066895, + "kl": 24.25, + "learning_rate": 1.211324796455389e-05, + "loss": 2.0374, + "num_tokens": 30871379.0, + "reward": 1.38671875, + "reward_std": 0.8403182923793793, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4745560586452484, + "rewards/tag_count_reward/mean": 0.73046875, + "rewards/tag_count_reward/std": 0.3837076500058174, + "step": 979, + "token_counts/after_target": 1169.25, + "token_counts/after_think": 175.0, + "token_counts/before_target": 1892.75, + "token_counts/before_think": 1501.0 + }, + { + "avg_penalty/after_target": 2.486310601234436, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3307088017463684, + "avg_penalty/before_think": 0.4288194477558136, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.25, + "completions/max_terminated_length": 635.25, + "completions/mean_length": 276.984375, + "completions/mean_terminated_length": 276.984375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.49, + "grad_norm": 5.951283931732178, + "kl": 20.046875, + "learning_rate": 1.2096185629038219e-05, + 
"loss": 1.5742, + "num_tokens": 30899538.0, + "reward": 1.44140625, + "reward_std": 0.7672930508852005, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.43526528775691986, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.35411563515663147, + "step": 980, + "token_counts/after_target": 583.25, + "token_counts/after_think": 57.75, + "token_counts/before_target": 1876.0, + "token_counts/before_think": 1914.75 + }, + { + "avg_penalty/after_target": 2.3782802522182465, + "avg_penalty/after_think": 3.6971245408058167, + "avg_penalty/before_target": 0.4196862205862999, + "avg_penalty/before_think": 0.39990948140621185, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.25, + "completions/max_terminated_length": 572.25, + "completions/mean_length": 202.46875, + "completions/mean_terminated_length": 202.46875, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.4905, + "grad_norm": 5.0642409324646, + "kl": 16.8125, + "learning_rate": 1.2079116908177592e-05, + "loss": 1.5314, + "num_tokens": 30919504.0, + "reward": 1.546875, + "reward_std": 0.7789806127548218, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.40316852182149887, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.38162607699632645, + "step": 981, + "token_counts/after_target": 605.0, + "token_counts/after_think": 98.75, + "token_counts/before_target": 1527.75, + "token_counts/before_think": 1008.0 + }, + { + "avg_penalty/after_target": 2.604473203420639, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.32776010036468506, + 
"avg_penalty/before_think": 0.4859791174530983, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.5, + "completions/max_terminated_length": 499.5, + "completions/mean_length": 230.125, + "completions/mean_terminated_length": 230.125, + "completions/min_length": 48.5, + "completions/min_terminated_length": 48.5, + "epoch": 0.491, + "grad_norm": 9.434720993041992, + "kl": 14.046875, + "learning_rate": 1.2062041853966298e-05, + "loss": 1.4861, + "num_tokens": 30942984.0, + "reward": 1.4921875, + "reward_std": 0.7970504313707352, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.46034691482782364, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.3584568053483963, + "step": 982, + "token_counts/after_target": 603.0, + "token_counts/after_think": 200.0, + "token_counts/before_target": 1749.0, + "token_counts/before_think": 1130.0 + }, + { + "avg_penalty/after_target": 2.2750879526138306, + "avg_penalty/after_think": 2.925437331199646, + "avg_penalty/before_target": 0.4220321998000145, + "avg_penalty/before_think": 0.5568186938762665, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 685.0, + "completions/max_terminated_length": 527.75, + "completions/mean_length": 261.6875, + "completions/mean_terminated_length": 248.60520935058594, + "completions/min_length": 56.5, + "completions/min_terminated_length": 56.5, + "epoch": 0.4915, + "grad_norm": 12.784745216369629, + "kl": 15.890625, + "learning_rate": 1.2044960518417902e-05, + "loss": 1.7446, + "num_tokens": 30968484.0, + "reward": 1.5703125, + 
"reward_std": 0.767589196562767, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4079566150903702, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3714434579014778, + "step": 983, + "token_counts/after_target": 778.0, + "token_counts/after_think": 216.25, + "token_counts/before_target": 2006.25, + "token_counts/before_think": 1186.5 + }, + { + "avg_penalty/after_target": 2.505357414484024, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.35185596346855164, + "avg_penalty/before_think": 0.3909422755241394, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.5, + "completions/max_terminated_length": 543.5, + "completions/mean_length": 192.5625, + "completions/mean_terminated_length": 192.5625, + "completions/min_length": 46.75, + "completions/min_terminated_length": 46.75, + "epoch": 0.492, + "grad_norm": 4.535998821258545, + "kl": 17.109375, + "learning_rate": 1.2027872953565125e-05, + "loss": 1.5654, + "num_tokens": 30993576.0, + "reward": 1.48046875, + "reward_std": 0.80887171626091, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4634971097111702, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.35895371437072754, + "step": 984, + "token_counts/after_target": 408.0, + "token_counts/after_think": 50.75, + "token_counts/before_target": 1942.0, + "token_counts/before_think": 680.25 + }, + { + "avg_penalty/after_target": 2.3268179297447205, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.5288293361663818, + "avg_penalty/before_think": 0.49963629990816116, + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 610.5, + "completions/max_terminated_length": 552.25, + "completions/mean_length": 227.953125, + "completions/mean_terminated_length": 217.05937957763672, + "completions/min_length": 45.75, + "completions/min_terminated_length": 45.75, + "epoch": 0.4925, + "grad_norm": 9.891129493713379, + "kl": 16.125, + "learning_rate": 1.2010779211459649e-05, + "loss": 1.6783, + "num_tokens": 31021381.0, + "reward": 1.57421875, + "reward_std": 0.7593326270580292, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4260597825050354, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.3612324520945549, + "step": 985, + "token_counts/after_target": 987.0, + "token_counts/after_think": 123.25, + "token_counts/before_target": 1592.75, + "token_counts/before_think": 944.25 + }, + { + "avg_penalty/after_target": 3.0249610543251038, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.4585929438471794, + "avg_penalty/before_think": 0.32952403277158737, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.75, + "completions/max_terminated_length": 479.75, + "completions/mean_length": 216.078125, + "completions/mean_terminated_length": 216.078125, + "completions/min_length": 60.25, + "completions/min_terminated_length": 60.25, + "epoch": 0.493, + "grad_norm": 4.035488605499268, + "kl": 19.921875, + "learning_rate": 1.1993679344171973e-05, + "loss": 1.7803, + "num_tokens": 31042730.0, + "reward": 1.39453125, + "reward_std": 0.8613072633743286, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.46875541657209396, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.4030046910047531, + "step": 986, + "token_counts/after_target": 903.0, + "token_counts/after_think": 22.25, + "token_counts/before_target": 1654.0, + "token_counts/before_think": 878.0 + }, + { + "avg_penalty/after_target": 3.1260687708854675, + "avg_penalty/after_think": 3.8406697511672974, + "avg_penalty/before_target": 0.23826773092150688, + "avg_penalty/before_think": 0.32886261492967606, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.75, + "completions/max_terminated_length": 514.75, + "completions/mean_length": 196.15625, + "completions/mean_terminated_length": 196.15625, + "completions/min_length": 33.5, + "completions/min_terminated_length": 33.5, + "epoch": 0.4935, + "grad_norm": 11.04300594329834, + "kl": 19.359375, + "learning_rate": 1.1976573403791263e-05, + "loss": 1.3176, + "num_tokens": 31067044.0, + "reward": 1.42578125, + "reward_std": 0.87924624979496, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.46822190284729004, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.4250713586807251, + "step": 987, + "token_counts/after_target": 295.0, + "token_counts/after_think": 139.25, + "token_counts/before_target": 1691.25, + "token_counts/before_think": 1013.0 + }, + { + "avg_penalty/after_target": 3.154203236103058, + "avg_penalty/after_think": 3.940959095954895, + "avg_penalty/before_target": 0.40659475326538086, + "avg_penalty/before_think": 0.45485468953847885, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.5, + "completions/max_terminated_length": 489.5, + "completions/mean_length": 183.203125, + "completions/mean_terminated_length": 183.203125, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.494, + "grad_norm": 7.667595386505127, + "kl": 22.15625, + "learning_rate": 1.1959461442425178e-05, + "loss": 1.9865, + "num_tokens": 31089169.0, + "reward": 1.49609375, + "reward_std": 0.7983611524105072, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.44495995342731476, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.3756765201687813, + "step": 988, + "token_counts/after_target": 571.0, + "token_counts/after_think": 75.75, + "token_counts/before_target": 1550.0, + "token_counts/before_think": 734.5 + }, + { + "avg_penalty/after_target": 3.073459804058075, + "avg_penalty/after_think": 2.4171112179756165, + "avg_penalty/before_target": 0.24611206352710724, + "avg_penalty/before_think": 0.4679928421974182, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 146.46875, + "completions/mean_terminated_length": 146.46875, + "completions/min_length": 37.25, + "completions/min_terminated_length": 37.25, + "epoch": 0.4945, + "grad_norm": 6.137958526611328, + "kl": 16.921875, + "learning_rate": 1.194234351219972e-05, + "loss": 1.3214, + "num_tokens": 31111743.0, + "reward": 1.625, + "reward_std": 0.7197436541318893, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.3890564441680908, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.334683146327734, + "step": 989, + "token_counts/after_target": 196.25, + "token_counts/after_think": 64.75, + "token_counts/before_target": 1170.25, + "token_counts/before_think": 912.25 + }, + { + "avg_penalty/after_target": 2.7332699298858643, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3048560582101345, + "avg_penalty/before_think": 0.3919045925140381, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.75, + "completions/max_terminated_length": 591.75, + "completions/mean_length": 210.828125, + "completions/mean_terminated_length": 210.828125, + "completions/min_length": 50.25, + "completions/min_terminated_length": 50.25, + "epoch": 0.495, + "grad_norm": 9.114163398742676, + "kl": 26.71875, + "learning_rate": 1.1925219665259076e-05, + "loss": 2.0019, + "num_tokens": 31134516.0, + "reward": 1.4609375, + "reward_std": 0.8794183284044266, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.42870646715164185, + "step": 990, + "token_counts/after_target": 498.25, + "token_counts/after_think": 99.25, + "token_counts/before_target": 1929.75, + "token_counts/before_think": 846.0 + }, + { + "avg_penalty/after_target": 2.58782297372818, + "avg_penalty/after_think": 3.372378885746002, + "avg_penalty/before_target": 0.4909139350056648, + "avg_penalty/before_think": 0.38502003997564316, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 201.515625, + "completions/mean_terminated_length": 201.515625, + "completions/min_length": 60.75, + "completions/min_terminated_length": 60.75, + "epoch": 0.4955, + "grad_norm": 8.162313461303711, + "kl": 29.4375, + "learning_rate": 1.190808995376545e-05, + "loss": 2.3209, + "num_tokens": 31156789.0, + "reward": 1.4140625, + "reward_std": 0.8949063867330551, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.47360680997371674, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.43312741070985794, + "step": 991, + "token_counts/after_target": 715.25, + "token_counts/after_think": 71.0, + "token_counts/before_target": 1688.75, + "token_counts/before_think": 749.25 + }, + { + "avg_penalty/after_target": 2.2913128435611725, + "avg_penalty/after_think": 3.9152286648750305, + "avg_penalty/before_target": 0.4285476580262184, + "avg_penalty/before_think": 0.4779798164963722, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.75, + "completions/max_terminated_length": 477.75, + "completions/mean_length": 169.40625, + "completions/mean_terminated_length": 169.40625, + "completions/min_length": 45.25, + "completions/min_terminated_length": 45.25, + "epoch": 0.496, + "grad_norm": 3.2840189933776855, + "kl": 19.46875, + "learning_rate": 1.1890954429898914e-05, + "loss": 1.7828, + "num_tokens": 31177791.0, + "reward": 1.65625, + "reward_std": 0.722085177898407, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 
0.3943893313407898, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.33259278163313866, + "step": 992, + "token_counts/after_target": 508.0, + "token_counts/after_think": 63.25, + "token_counts/before_target": 1444.5, + "token_counts/before_think": 694.75 + }, + { + "avg_penalty/after_target": 3.683283805847168, + "avg_penalty/after_think": 2.9510934948921204, + "avg_penalty/before_target": 0.3290525749325752, + "avg_penalty/before_think": 0.38588568568229675, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.25, + "completions/max_terminated_length": 396.25, + "completions/mean_length": 173.5, + "completions/mean_terminated_length": 173.5, + "completions/min_length": 47.75, + "completions/min_terminated_length": 47.75, + "epoch": 0.4965, + "grad_norm": 3.4288218021392822, + "kl": 19.46875, + "learning_rate": 1.187381314585725e-05, + "loss": 1.7472, + "num_tokens": 31197775.0, + "reward": 1.5859375, + "reward_std": 0.769734799861908, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.43303824216127396, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.3592139035463333, + "step": 993, + "token_counts/after_target": 438.25, + "token_counts/after_think": 18.0, + "token_counts/before_target": 1639.25, + "token_counts/before_think": 680.5 + }, + { + "avg_penalty/after_target": 2.876053273677826, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.5251477956771851, + "avg_penalty/before_think": 0.5142403766512871, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + 
"completions/max_length": 769.25, + "completions/max_terminated_length": 683.5, + "completions/mean_length": 240.765625, + "completions/mean_terminated_length": 228.65521240234375, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.497, + "grad_norm": 9.960227012634277, + "kl": 33.15625, + "learning_rate": 1.1856666153855776e-05, + "loss": 2.6106, + "num_tokens": 31225344.0, + "reward": 1.37890625, + "reward_std": 0.9198508858680725, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.48148179799318314, + "rewards/tag_count_reward/mean": 0.70703125, + "rewards/tag_count_reward/std": 0.45191408693790436, + "step": 994, + "token_counts/after_target": 1036.0, + "token_counts/after_think": 37.75, + "token_counts/before_target": 2111.25, + "token_counts/before_think": 667.25 + }, + { + "avg_penalty/after_target": 2.0372577011585236, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.30623817443847656, + "avg_penalty/before_think": 0.5893804505467415, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.25, + "completions/max_terminated_length": 406.25, + "completions/mean_length": 150.84375, + "completions/mean_terminated_length": 150.84375, + "completions/min_length": 41.25, + "completions/min_terminated_length": 41.25, + "epoch": 0.4975, + "grad_norm": 6.997900009155273, + "kl": 21.625, + "learning_rate": 1.1839513506127202e-05, + "loss": 1.6591, + "num_tokens": 31245414.0, + "reward": 1.546875, + "reward_std": 0.9259322434663773, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.11967839300632477, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.47083858400583267, + 
"rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.40411460399627686, + "step": 995, + "token_counts/after_target": 209.25, + "token_counts/after_think": 167.0, + "token_counts/before_target": 1418.25, + "token_counts/before_think": 619.0 + }, + { + "avg_penalty/after_target": 3.0172005891799927, + "avg_penalty/after_think": 1.9952808022499084, + "avg_penalty/before_target": 0.26088665053248405, + "avg_penalty/before_think": 0.38428985327482224, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.5, + "completions/max_terminated_length": 582.5, + "completions/mean_length": 191.671875, + "completions/mean_terminated_length": 191.671875, + "completions/min_length": 61.5, + "completions/min_terminated_length": 61.5, + "epoch": 0.498, + "grad_norm": 2.5328292846679688, + "kl": 21.4921875, + "learning_rate": 1.1822355254921478e-05, + "loss": 1.8533, + "num_tokens": 31268129.0, + "reward": 1.62109375, + "reward_std": 0.7447032034397125, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.39476002007722855, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.36017677932977676, + "step": 996, + "token_counts/after_target": 507.5, + "token_counts/after_think": 33.5, + "token_counts/before_target": 1742.5, + "token_counts/before_think": 783.25 + }, + { + "avg_penalty/after_target": 2.1099972426891327, + "avg_penalty/after_think": 1.7793083786964417, + "avg_penalty/before_target": 0.44104374200105667, + "avg_penalty/before_think": 0.3189419209957123, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + 
"completions/max_length": 588.0, + "completions/max_terminated_length": 577.25, + "completions/mean_length": 220.375, + "completions/mean_terminated_length": 198.8169708251953, + "completions/min_length": 67.75, + "completions/min_terminated_length": 67.75, + "epoch": 0.4985, + "grad_norm": 5.5180511474609375, + "kl": 27.84375, + "learning_rate": 1.1805191452505602e-05, + "loss": 2.1122, + "num_tokens": 31295353.0, + "reward": 1.46875, + "reward_std": 0.8711400628089905, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4604102149605751, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.4168848618865013, + "step": 997, + "token_counts/after_target": 776.0, + "token_counts/after_think": 11.75, + "token_counts/before_target": 1951.0, + "token_counts/before_think": 787.25 + }, + { + "avg_penalty/after_target": 3.4064043760299683, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.40120580047369003, + "avg_penalty/before_think": 0.4543878249824047, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 756.5, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 201.09375, + "completions/mean_terminated_length": 175.01771545410156, + "completions/min_length": 56.75, + "completions/min_terminated_length": 56.75, + "epoch": 0.499, + "grad_norm": 11.692556381225586, + "kl": 24.78125, + "learning_rate": 1.1788022151163497e-05, + "loss": 2.4634, + "num_tokens": 31318447.0, + "reward": 1.59375, + "reward_std": 0.781768798828125, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.8125, + 
"rewards/tag_count_reward/std": 0.372066430747509, + "step": 998, + "token_counts/after_target": 774.25, + "token_counts/after_think": 43.25, + "token_counts/before_target": 1657.25, + "token_counts/before_think": 742.75 + }, + { + "avg_penalty/after_target": 1.5712783634662628, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4787730425596237, + "avg_penalty/before_think": 0.36765381693840027, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.5, + "completions/max_terminated_length": 751.5, + "completions/mean_length": 191.9375, + "completions/mean_terminated_length": 191.9375, + "completions/min_length": 44.75, + "completions/min_terminated_length": 44.75, + "epoch": 0.4995, + "grad_norm": 2.9241678714752197, + "kl": 23.5625, + "learning_rate": 1.1770847403195836e-05, + "loss": 1.9958, + "num_tokens": 31340491.0, + "reward": 1.64453125, + "reward_std": 0.756582498550415, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.3612314313650131, + "step": 999, + "token_counts/after_target": 426.75, + "token_counts/after_think": 40.25, + "token_counts/before_target": 1680.0, + "token_counts/before_think": 924.0 + }, + { + "avg_penalty/after_target": 3.1780182123184204, + "avg_penalty/after_think": 1.995543360710144, + "avg_penalty/before_target": 0.31047895178198814, + "avg_penalty/before_think": 0.43388403952121735, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 608.25, + "completions/max_terminated_length": 532.5, + 
"completions/mean_length": 183.21875, + "completions/mean_terminated_length": 170.04687881469727, + "completions/min_length": 58.25, + "completions/min_terminated_length": 58.25, + "epoch": 0.5, + "grad_norm": 14.042970657348633, + "kl": 20.90625, + "learning_rate": 1.1753667260919872e-05, + "loss": 2.2045, + "num_tokens": 31362665.0, + "reward": 1.640625, + "reward_std": 0.757043182849884, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3943893313407898, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.3669255003333092, + "step": 1000, + "token_counts/after_target": 760.75, + "token_counts/after_think": 37.25, + "token_counts/before_target": 1379.75, + "token_counts/before_think": 753.75 + }, + { + "avg_penalty/after_target": 1.9895598590373993, + "avg_penalty/after_think": 3.896423041820526, + "avg_penalty/before_target": 0.338092602789402, + "avg_penalty/before_think": 0.4085860252380371, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 680.0, + "completions/max_terminated_length": 680.0, + "completions/mean_length": 172.96875, + "completions/mean_terminated_length": 172.96875, + "completions/min_length": 41.25, + "completions/min_terminated_length": 41.25, + "epoch": 0.5005, + "grad_norm": 3.356595277786255, + "kl": 28.5625, + "learning_rate": 1.1736481776669307e-05, + "loss": 2.4479, + "num_tokens": 31385687.0, + "reward": 1.61328125, + "reward_std": 0.7458162158727646, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.38724804669618607, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.36018431186676025, + "step": 1001, + 
"token_counts/after_target": 412.75, + "token_counts/after_think": 22.25, + "token_counts/before_target": 1804.25, + "token_counts/before_think": 528.25 + }, + { + "avg_penalty/after_target": 2.3900834023952484, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.34449707344174385, + "avg_penalty/before_think": 0.6199318841099739, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.25, + "completions/max_terminated_length": 628.25, + "completions/mean_length": 197.28125, + "completions/mean_terminated_length": 197.28125, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.501, + "grad_norm": 4.7969889640808105, + "kl": 13.0166015625, + "learning_rate": 1.1719291002794096e-05, + "loss": 1.4964, + "num_tokens": 31409225.0, + "reward": 1.81640625, + "reward_std": 0.48938964307308197, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.24866948276758194, + "rewards/tag_count_reward/mean": 0.91015625, + "rewards/tag_count_reward/std": 0.24120622873306274, + "step": 1002, + "token_counts/after_target": 477.25, + "token_counts/after_think": 105.25, + "token_counts/before_target": 1741.0, + "token_counts/before_think": 833.0 + }, + { + "avg_penalty/after_target": 3.309955656528473, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.4970690794289112, + "avg_penalty/before_think": 0.3746064081788063, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.5, + "completions/max_terminated_length": 796.5, + "completions/mean_length": 200.875, + "completions/mean_terminated_length": 
200.875, + "completions/min_length": 50.5, + "completions/min_terminated_length": 50.5, + "epoch": 0.5015, + "grad_norm": 14.478066444396973, + "kl": 29.21875, + "learning_rate": 1.1702094991660326e-05, + "loss": 2.9022, + "num_tokens": 31433809.0, + "reward": 1.63671875, + "reward_std": 0.7696407586336136, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.37676475942134857, + "step": 1003, + "token_counts/after_target": 985.5, + "token_counts/after_think": 3.0, + "token_counts/before_target": 1588.5, + "token_counts/before_think": 637.0 + }, + { + "avg_penalty/after_target": 2.8914204239845276, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.43665197864174843, + "avg_penalty/before_think": 0.36570774763822556, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 693.5, + "completions/max_terminated_length": 666.75, + "completions/mean_length": 236.40625, + "completions/mean_terminated_length": 225.17083740234375, + "completions/min_length": 50.25, + "completions/min_terminated_length": 50.25, + "epoch": 0.502, + "grad_norm": 7.846822738647461, + "kl": 26.8125, + "learning_rate": 1.1684893795650028e-05, + "loss": 2.1906, + "num_tokens": 31459483.0, + "reward": 1.578125, + "reward_std": 0.7239035815000534, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.40316852182149887, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.33170055598020554, + "step": 1004, + "token_counts/after_target": 817.25, + "token_counts/after_think": 39.0, + 
"token_counts/before_target": 2105.5, + "token_counts/before_think": 820.75 + }, + { + "avg_penalty/after_target": 3.0039168298244476, + "avg_penalty/after_think": 2.9926087856292725, + "avg_penalty/before_target": 0.28053920343518257, + "avg_penalty/before_think": 0.44802429527044296, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.5, + "completions/max_terminated_length": 571.5, + "completions/mean_length": 172.875, + "completions/mean_terminated_length": 172.875, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.5025, + "grad_norm": 5.945418834686279, + "kl": 13.201171875, + "learning_rate": 1.1667687467161025e-05, + "loss": 1.4457, + "num_tokens": 31478323.0, + "reward": 1.84375, + "reward_std": 0.5457825064659119, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.27289126068353653, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.27289126068353653, + "step": 1005, + "token_counts/after_target": 345.5, + "token_counts/after_think": 28.5, + "token_counts/before_target": 1448.25, + "token_counts/before_think": 943.75 + }, + { + "avg_penalty/after_target": 2.5042415261268616, + "avg_penalty/after_think": 3.816153585910797, + "avg_penalty/before_target": 0.48308586701750755, + "avg_penalty/before_think": 0.37746575474739075, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 772.0, + "completions/max_terminated_length": 575.5, + "completions/mean_length": 201.421875, + "completions/mean_terminated_length": 175.7677116394043, + "completions/min_length": 
43.75, + "completions/min_terminated_length": 43.75, + "epoch": 0.503, + "grad_norm": 9.439269065856934, + "kl": 30.296875, + "learning_rate": 1.1650476058606776e-05, + "loss": 2.3491, + "num_tokens": 31507342.0, + "reward": 1.61328125, + "reward_std": 0.7849788814783096, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.3774123564362526, + "step": 1006, + "token_counts/after_target": 669.0, + "token_counts/after_think": 24.25, + "token_counts/before_target": 1706.75, + "token_counts/before_think": 822.75 + }, + { + "avg_penalty/after_target": 2.870973974466324, + "avg_penalty/after_think": 3.9538350105285645, + "avg_penalty/before_target": 0.40788908675312996, + "avg_penalty/before_think": 0.4441780522465706, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 142.9375, + "completions/mean_terminated_length": 142.9375, + "completions/min_length": 44.25, + "completions/min_terminated_length": 44.25, + "epoch": 0.5035, + "grad_norm": 8.20310115814209, + "kl": 13.458984375, + "learning_rate": 1.1633259622416224e-05, + "loss": 1.6942, + "num_tokens": 31527498.0, + "reward": 1.859375, + "reward_std": 0.4151124209165573, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.2257782220840454, + "rewards/tag_count_reward/mean": 0.9375, + "rewards/tag_count_reward/std": 0.1946709007024765, + "step": 1007, + "token_counts/after_target": 454.0, + "token_counts/after_think": 113.5, + "token_counts/before_target": 1110.0, + 
"token_counts/before_think": 609.5 + }, + { + "avg_penalty/after_target": 2.602546453475952, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.2898172102868557, + "avg_penalty/before_think": 0.4175032377243042, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.5, + "completions/max_terminated_length": 545.5, + "completions/mean_length": 143.390625, + "completions/mean_terminated_length": 143.390625, + "completions/min_length": 44.25, + "completions/min_terminated_length": 44.25, + "epoch": 0.504, + "grad_norm": 9.276961326599121, + "kl": 24.1875, + "learning_rate": 1.1616038211033613e-05, + "loss": 1.8638, + "num_tokens": 31545347.0, + "reward": 1.6953125, + "reward_std": 0.7191491574048996, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36797718703746796, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.35210946947336197, + "step": 1008, + "token_counts/after_target": 296.5, + "token_counts/after_think": 26.0, + "token_counts/before_target": 1401.0, + "token_counts/before_think": 570.75 + }, + { + "avg_penalty/after_target": 2.3145511746406555, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.445513091981411, + "avg_penalty/before_think": 0.35300374031066895, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 756.25, + "completions/max_terminated_length": 678.25, + "completions/mean_length": 218.53125, + "completions/mean_terminated_length": 205.69791793823242, + "completions/min_length": 58.25, + "completions/min_terminated_length": 58.25, + "epoch": 0.5045, 
+ "grad_norm": 7.020692348480225, + "kl": 28.8125, + "learning_rate": 1.159881187691835e-05, + "loss": 2.3373, + "num_tokens": 31571237.0, + "reward": 1.66796875, + "reward_std": 0.741533949971199, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38336414843797684, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.36079762876033783, + "step": 1009, + "token_counts/after_target": 718.5, + "token_counts/after_think": 62.75, + "token_counts/before_target": 1637.5, + "token_counts/before_think": 1077.75 + }, + { + "avg_penalty/after_target": 2.3486265540122986, + "avg_penalty/after_think": 2.8291186690330505, + "avg_penalty/before_target": 0.2447037771344185, + "avg_penalty/before_think": 0.42169956862926483, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 165.171875, + "completions/mean_terminated_length": 165.171875, + "completions/min_length": 41.75, + "completions/min_terminated_length": 41.75, + "epoch": 0.505, + "grad_norm": 10.476785659790039, + "kl": 24.03125, + "learning_rate": 1.1581580672544839e-05, + "loss": 1.783, + "num_tokens": 31594272.0, + "reward": 1.58203125, + "reward_std": 0.8544141203165054, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.422013059258461, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.4145498052239418, + "step": 1010, + "token_counts/after_target": 419.5, + "token_counts/after_think": 61.25, + "token_counts/before_target": 1326.0, + "token_counts/before_think": 836.0 + }, + { + "avg_penalty/after_target": 
2.4920158088207245, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3710152134299278, + "avg_penalty/before_think": 0.36152517050504684, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 222.140625, + "completions/mean_terminated_length": 222.140625, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.5055, + "grad_norm": 6.477023601531982, + "kl": 19.1796875, + "learning_rate": 1.156434465040231e-05, + "loss": 1.5674, + "num_tokens": 31619865.0, + "reward": 1.640625, + "reward_std": 0.7074770778417587, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.37366948276758194, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.33573007583618164, + "step": 1011, + "token_counts/after_target": 713.0, + "token_counts/after_think": 51.0, + "token_counts/before_target": 1934.25, + "token_counts/before_think": 856.0 + }, + { + "avg_penalty/after_target": 2.293103873729706, + "avg_penalty/after_think": 3.5173206329345703, + "avg_penalty/before_target": 0.32891542091965675, + "avg_penalty/before_think": 0.40988077968358994, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.75, + "completions/max_terminated_length": 398.75, + "completions/mean_length": 149.515625, + "completions/mean_terminated_length": 149.515625, + "completions/min_length": 54.5, + "completions/min_terminated_length": 54.5, + "epoch": 0.506, + "grad_norm": 4.312129020690918, + "kl": 14.90625, + "learning_rate": 
1.1547103862994683e-05, + "loss": 1.3212, + "num_tokens": 31640842.0, + "reward": 1.79296875, + "reward_std": 0.5986148715019226, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.31116948276758194, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.2893451973795891, + "step": 1012, + "token_counts/after_target": 287.25, + "token_counts/after_think": 40.5, + "token_counts/before_target": 1190.25, + "token_counts/before_think": 874.25 + }, + { + "avg_penalty/after_target": 2.886269837617874, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.406447883695364, + "avg_penalty/before_think": 0.5582784935832024, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 905.25, + "completions/max_terminated_length": 476.75, + "completions/mean_length": 214.671875, + "completions/mean_terminated_length": 160.05908393859863, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.5065, + "grad_norm": 14.848457336425781, + "kl": 25.640625, + "learning_rate": 1.1529858362840383e-05, + "loss": 2.7254, + "num_tokens": 31664709.0, + "reward": 1.8046875, + "reward_std": 0.6361649632453918, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3186737820506096, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.3012707233428955, + "step": 1013, + "token_counts/after_target": 1101.75, + "token_counts/after_think": 86.5, + "token_counts/before_target": 1696.75, + "token_counts/before_think": 549.75 + }, + { + "avg_penalty/after_target": 1.7822951972484589, + "avg_penalty/after_think": 3.7009005546569824, + 
"avg_penalty/before_target": 0.4073813408613205, + "avg_penalty/before_think": 0.4642046168446541, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 603.5, + "completions/max_terminated_length": 447.5, + "completions/mean_length": 157.765625, + "completions/mean_terminated_length": 143.54583549499512, + "completions/min_length": 40.5, + "completions/min_terminated_length": 40.5, + "epoch": 0.507, + "grad_norm": 5.380350112915039, + "kl": 22.2265625, + "learning_rate": 1.1512608202472195e-05, + "loss": 1.9054, + "num_tokens": 31686310.0, + "reward": 1.625, + "reward_std": 0.7496283203363419, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.3534414619207382, + "step": 1014, + "token_counts/after_target": 433.5, + "token_counts/after_think": 45.0, + "token_counts/before_target": 1415.0, + "token_counts/before_think": 630.75 + }, + { + "avg_penalty/after_target": 2.0502922534942627, + "avg_penalty/after_think": 2.6727136969566345, + "avg_penalty/before_target": 0.40802236646413803, + "avg_penalty/before_think": 0.5537505820393562, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.75, + "completions/max_terminated_length": 620.75, + "completions/mean_length": 200.203125, + "completions/mean_terminated_length": 200.203125, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.5075, + "grad_norm": 3.3456151485443115, + "kl": 20.140625, + "learning_rate": 1.1495353434437098e-05, + "loss": 1.6734, + 
"num_tokens": 31709187.0, + "reward": 1.66796875, + "reward_std": 0.6723240464925766, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.39656074345111847, + "rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.2955506071448326, + "step": 1015, + "token_counts/after_target": 534.5, + "token_counts/after_think": 77.75, + "token_counts/before_target": 1586.0, + "token_counts/before_think": 1005.0 + }, + { + "avg_penalty/after_target": 2.140233039855957, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3850085139274597, + "avg_penalty/before_think": 0.47277138382196426, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 253.28125, + "completions/mean_terminated_length": 253.28125, + "completions/min_length": 83.75, + "completions/min_terminated_length": 83.75, + "epoch": 0.508, + "grad_norm": 3.7390103340148926, + "kl": 12.109375, + "learning_rate": 1.1478094111296109e-05, + "loss": 1.1423, + "num_tokens": 31733493.0, + "reward": 1.73828125, + "reward_std": 0.5588664561510086, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.2979728877544403, + "rewards/tag_count_reward/mean": 0.87890625, + "rewards/tag_count_reward/std": 0.26349523663520813, + "step": 1016, + "token_counts/after_target": 616.0, + "token_counts/after_think": 68.0, + "token_counts/before_target": 2051.25, + "token_counts/before_think": 1317.25 + }, + { + "avg_penalty/after_target": 2.4156576693058014, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.4411395862698555, + "avg_penalty/before_think": 
0.6546591594815254, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.5, + "completions/max_terminated_length": 611.5, + "completions/mean_length": 270.40625, + "completions/mean_terminated_length": 270.40625, + "completions/min_length": 43.25, + "completions/min_terminated_length": 43.25, + "epoch": 0.5085, + "grad_norm": 3.6665501594543457, + "kl": 16.109375, + "learning_rate": 1.1460830285624119e-05, + "loss": 1.452, + "num_tokens": 31760127.0, + "reward": 1.5078125, + "reward_std": 0.7781103253364563, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4229728877544403, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.3855060786008835, + "step": 1017, + "token_counts/after_target": 929.5, + "token_counts/after_think": 226.5, + "token_counts/before_target": 2252.5, + "token_counts/before_think": 918.0 + }, + { + "avg_penalty/after_target": 2.5074435770511627, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.40312909334897995, + "avg_penalty/before_think": 0.6261738836765289, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.75, + "completions/max_terminated_length": 553.75, + "completions/mean_length": 290.5, + "completions/mean_terminated_length": 290.5, + "completions/min_length": 68.25, + "completions/min_terminated_length": 68.25, + "epoch": 0.509, + "grad_norm": 3.7386722564697266, + "kl": 15.75, + "learning_rate": 1.1443562010009732e-05, + "loss": 1.4452, + "num_tokens": 31790431.0, + "reward": 1.49609375, + "reward_std": 0.7487521469593048, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.42516325414180756, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.3668944016098976, + "step": 1018, + "token_counts/after_target": 887.0, + "token_counts/after_think": 242.25, + "token_counts/before_target": 2314.0, + "token_counts/before_think": 1204.75 + }, + { + "avg_penalty/after_target": 2.521914690732956, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4549231231212616, + "avg_penalty/before_think": 0.6010920852422714, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 276.6875, + "completions/mean_terminated_length": 276.6875, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.5095, + "grad_norm": 6.921789646148682, + "kl": 18.765625, + "learning_rate": 1.1426289337055119e-05, + "loss": 1.4359, + "num_tokens": 31817259.0, + "reward": 1.35546875, + "reward_std": 0.856022298336029, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.47083858400583267, + "rewards/tag_count_reward/mean": 0.62109375, + "rewards/tag_count_reward/std": 0.39121896028518677, + "step": 1019, + "token_counts/after_target": 955.0, + "token_counts/after_think": 105.5, + "token_counts/before_target": 2603.0, + "token_counts/before_think": 763.5 + }, + { + "avg_penalty/after_target": 2.7517842650413513, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.46285923570394516, + "avg_penalty/before_think": 0.8071712255477905, + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 718.0, + "completions/max_terminated_length": 692.25, + "completions/mean_length": 354.1875, + "completions/mean_terminated_length": 332.89063262939453, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.51, + "grad_norm": 4.20820951461792, + "kl": 14.90625, + "learning_rate": 1.1409012319375828e-05, + "loss": 1.4068, + "num_tokens": 31849271.0, + "reward": 1.44921875, + "reward_std": 0.7132754921913147, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4255262687802315, + "rewards/tag_count_reward/mean": 0.68359375, + "rewards/tag_count_reward/std": 0.38644056022167206, + "step": 1020, + "token_counts/after_target": 1646.5, + "token_counts/after_think": 152.75, + "token_counts/before_target": 2749.75, + "token_counts/before_think": 1118.0 + }, + { + "avg_penalty/after_target": 2.003502905368805, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5669670403003693, + "avg_penalty/before_think": 0.9555642157793045, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 833.5, + "completions/max_terminated_length": 833.5, + "completions/mean_length": 419.34375, + "completions/mean_terminated_length": 419.34375, + "completions/min_length": 136.75, + "completions/min_terminated_length": 136.75, + "epoch": 0.5105, + "grad_norm": 4.148513317108154, + "kl": 15.546875, + "learning_rate": 1.1391731009600655e-05, + "loss": 1.3858, + "num_tokens": 31885037.0, + "reward": 1.48828125, + "reward_std": 0.5789971277117729, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.35648179799318314, + "rewards/tag_count_reward/mean": 0.66015625, + "rewards/tag_count_reward/std": 0.318061962723732, + "step": 1021, + "token_counts/after_target": 1799.5, + "token_counts/after_think": 162.0, + "token_counts/before_target": 3241.75, + "token_counts/before_think": 1506.25 + }, + { + "avg_penalty/after_target": 2.207197457551956, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5367016494274139, + "avg_penalty/before_think": 0.8904537111520767, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 763.5, + "completions/mean_length": 459.203125, + "completions/mean_terminated_length": 421.5500183105469, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.511, + "grad_norm": 2.3058364391326904, + "kl": 16.6875, + "learning_rate": 1.1374445460371466e-05, + "loss": 1.4461, + "num_tokens": 31924138.0, + "reward": 1.28125, + "reward_std": 0.7526568472385406, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.47083858400583267, + "rewards/tag_count_reward/mean": 0.578125, + "rewards/tag_count_reward/std": 0.36161666363477707, + "step": 1022, + "token_counts/after_target": 2122.25, + "token_counts/after_think": 221.0, + "token_counts/before_target": 3152.25, + "token_counts/before_think": 1851.75 + }, + { + "avg_penalty/after_target": 2.239031195640564, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.6221555024385452, + "avg_penalty/before_think": 0.7755210697650909, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 887.5, + "completions/max_terminated_length": 811.25, + "completions/mean_length": 492.359375, + "completions/mean_terminated_length": 468.6760559082031, + "completions/min_length": 117.5, + "completions/min_terminated_length": 117.5, + "epoch": 0.5115, + "grad_norm": 5.025350570678711, + "kl": 20.015625, + "learning_rate": 1.1357155724343046e-05, + "loss": 1.5825, + "num_tokens": 31964257.0, + "reward": 1.2734375, + "reward_std": 0.7434429377317429, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.48025963455438614, + "rewards/tag_count_reward/mean": 0.6171875, + "rewards/tag_count_reward/std": 0.36375797539949417, + "step": 1023, + "token_counts/after_target": 2352.5, + "token_counts/after_think": 88.5, + "token_counts/before_target": 3555.25, + "token_counts/before_think": 1881.5 + }, + { + "avg_penalty/after_target": 2.3504163920879364, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.6260479390621185, + "avg_penalty/before_think": 0.7802955955266953, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 848.0, + "completions/max_terminated_length": 834.25, + "completions/mean_length": 420.671875, + "completions/mean_terminated_length": 410.57813262939453, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.512, + "grad_norm": 5.823105335235596, + "kl": 15.03125, + "learning_rate": 1.1339861854182923e-05, + "loss": 1.4501, + "num_tokens": 32001100.0, + "reward": 1.3515625, + "reward_std": 0.7310081124305725, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + 
"rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4462348371744156, + "rewards/tag_count_reward/mean": 0.5859375, + "rewards/tag_count_reward/std": 0.3482547104358673, + "step": 1024, + "token_counts/after_target": 1920.25, + "token_counts/after_think": 206.0, + "token_counts/before_target": 3108.5, + "token_counts/before_think": 1496.0 + }, + { + "avg_penalty/after_target": 2.153028756380081, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.6707229763269424, + "avg_penalty/before_think": 0.9578258544206619, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 981.25, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 568.40625, + "completions/mean_terminated_length": 538.2318801879883, + "completions/min_length": 234.5, + "completions/min_terminated_length": 234.5, + "epoch": 0.5125, + "grad_norm": 6.481940269470215, + "kl": 13.609375, + "learning_rate": 1.1322563902571227e-05, + "loss": 1.3927, + "num_tokens": 32048150.0, + "reward": 1.3046875, + "reward_std": 0.641431525349617, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45247192680835724, + "rewards/tag_count_reward/mean": 0.5859375, + "rewards/tag_count_reward/std": 0.26782968267798424, + "step": 1025, + "token_counts/after_target": 2914.25, + "token_counts/after_think": 264.5, + "token_counts/before_target": 3739.75, + "token_counts/before_think": 2176.0 + }, + { + "avg_penalty/after_target": 2.378110110759735, + "avg_penalty/after_think": 3.736476480960846, + "avg_penalty/before_target": 0.5572921633720398, + "avg_penalty/before_think": 0.8369754552841187, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 896.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 435.84375, + "completions/mean_terminated_length": 408.0729293823242, + "completions/min_length": 175.25, + "completions/min_terminated_length": 175.25, + "epoch": 0.513, + "grad_norm": 1.9173836708068848, + "kl": 16.171875, + "learning_rate": 1.130526192220052e-05, + "loss": 1.4258, + "num_tokens": 32085660.0, + "reward": 1.265625, + "reward_std": 0.6569287776947021, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.4255262687802315, + "rewards/tag_count_reward/mean": 0.59375, + "rewards/tag_count_reward/std": 0.30104294419288635, + "step": 1026, + "token_counts/after_target": 1931.75, + "token_counts/after_think": 237.0, + "token_counts/before_target": 3075.5, + "token_counts/before_think": 1729.25 + }, + { + "avg_penalty/after_target": 2.1051173508167267, + "avg_penalty/after_think": 2.8710527420043945, + "avg_penalty/before_target": 0.5316063165664673, + "avg_penalty/before_think": 0.7674980312585831, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 928.75, + "completions/max_terminated_length": 806.75, + "completions/mean_length": 451.6875, + "completions/mean_terminated_length": 433.70418548583984, + "completions/min_length": 134.5, + "completions/min_terminated_length": 134.5, + "epoch": 0.5135, + "grad_norm": 1.8726364374160767, + "kl": 14.359375, + "learning_rate": 1.128795596577563e-05, + "loss": 1.2961, + "num_tokens": 32126008.0, + "reward": 1.33984375, + "reward_std": 0.7629950046539307, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.14789126068353653, + 
"rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.47360680997371674, + "rewards/tag_count_reward/mean": 0.60546875, + "rewards/tag_count_reward/std": 0.3269503340125084, + "step": 1027, + "token_counts/after_target": 1711.5, + "token_counts/after_think": 304.25, + "token_counts/before_target": 2927.75, + "token_counts/before_think": 2283.5 + }, + { + "avg_penalty/after_target": 2.086377114057541, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3492418974637985, + "avg_penalty/before_think": 0.6972441673278809, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.25, + "completions/max_terminated_length": 733.25, + "completions/mean_length": 422.46875, + "completions/mean_terminated_length": 422.46875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.514, + "grad_norm": 6.93661642074585, + "kl": 13.671875, + "learning_rate": 1.1270646086013507e-05, + "loss": 1.0268, + "num_tokens": 32162614.0, + "reward": 1.3125, + "reward_std": 0.7432514578104019, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.47669370472431183, + "rewards/tag_count_reward/mean": 0.625, + "rewards/tag_count_reward/std": 0.33853165060281754, + "step": 1028, + "token_counts/after_target": 1190.75, + "token_counts/after_think": 182.0, + "token_counts/before_target": 2888.0, + "token_counts/before_think": 2498.75 + }, + { + "avg_penalty/after_target": 2.2754384875297546, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4167233556509018, + "avg_penalty/before_think": 0.7663200199604034, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.25, + "completions/max_terminated_length": 635.25, + "completions/mean_length": 338.734375, + "completions/mean_terminated_length": 338.734375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.5145, + "grad_norm": 10.972956657409668, + "kl": 17.703125, + "learning_rate": 1.1253332335643043e-05, + "loss": 1.248, + "num_tokens": 32193845.0, + "reward": 1.06640625, + "reward_std": 0.8675564825534821, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.515625, + "rewards/format_reward/std": 0.5133601278066635, + "rewards/tag_count_reward/mean": 0.55078125, + "rewards/tag_count_reward/std": 0.39550816267728806, + "step": 1029, + "token_counts/after_target": 1207.5, + "token_counts/after_think": 64.5, + "token_counts/before_target": 2634.5, + "token_counts/before_think": 1513.25 + }, + { + "avg_penalty/after_target": 2.207944095134735, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5332057774066925, + "avg_penalty/before_think": 0.5815904885530472, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 746.25, + "completions/max_terminated_length": 746.25, + "completions/mean_length": 381.21875, + "completions/mean_terminated_length": 381.21875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.515, + "grad_norm": 4.988178730010986, + "kl": 16.046875, + "learning_rate": 1.1236014767404929e-05, + "loss": 1.3096, + "num_tokens": 32225779.0, + "reward": 1.1171875, + "reward_std": 0.7904378175735474, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 
0.49776528775691986, + "rewards/tag_count_reward/mean": 0.6171875, + "rewards/tag_count_reward/std": 0.355602890253067, + "step": 1030, + "token_counts/after_target": 1139.75, + "token_counts/after_think": 452.75, + "token_counts/before_target": 2108.25, + "token_counts/before_think": 2398.75 + }, + { + "avg_penalty/after_target": 2.750702738761902, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.4338397681713104, + "avg_penalty/before_think": 0.5227107182145119, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.5, + "completions/max_terminated_length": 620.5, + "completions/mean_length": 345.0625, + "completions/mean_terminated_length": 345.0625, + "completions/min_length": 60.75, + "completions/min_terminated_length": 60.75, + "epoch": 0.5155, + "grad_norm": 9.21134090423584, + "kl": 16.203125, + "learning_rate": 1.1218693434051475e-05, + "loss": 1.1712, + "num_tokens": 32258743.0, + "reward": 1.10546875, + "reward_std": 0.8838090002536774, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.47360680997371674, + "rewards/tag_count_reward/mean": 0.55859375, + "rewards/tag_count_reward/std": 0.39548053592443466, + "step": 1031, + "token_counts/after_target": 1221.75, + "token_counts/after_think": 217.0, + "token_counts/before_target": 1926.5, + "token_counts/before_think": 2155.75 + }, + { + "avg_penalty/after_target": 2.584471642971039, + "avg_penalty/after_think": 2.9555625319480896, + "avg_penalty/before_target": 0.4256849139928818, + "avg_penalty/before_think": 0.6544828340411186, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 
0.015625, + "completions/max_length": 657.0, + "completions/max_terminated_length": 542.5, + "completions/mean_length": 322.875, + "completions/mean_terminated_length": 312.5562515258789, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.516, + "grad_norm": 7.687010288238525, + "kl": 16.984375, + "learning_rate": 1.1201368388346471e-05, + "loss": 1.264, + "num_tokens": 32287823.0, + "reward": 0.890625, + "reward_std": 0.8446621149778366, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.359375, + "rewards/format_reward/std": 0.4876555874943733, + "rewards/tag_count_reward/mean": 0.53125, + "rewards/tag_count_reward/std": 0.42330706119537354, + "step": 1032, + "token_counts/after_target": 1014.75, + "token_counts/after_think": 83.75, + "token_counts/before_target": 2049.5, + "token_counts/before_think": 2018.0 + }, + { + "avg_penalty/after_target": 2.3159550428390503, + "avg_penalty/after_think": 2.9056565165519714, + "avg_penalty/before_target": 0.4260985404253006, + "avg_penalty/before_think": 0.7005708664655685, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.5, + "completions/max_terminated_length": 646.5, + "completions/mean_length": 312.984375, + "completions/mean_terminated_length": 312.984375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.5165, + "grad_norm": 4.382868766784668, + "kl": 14.515625, + "learning_rate": 1.1184039683065014e-05, + "loss": 1.1736, + "num_tokens": 32317422.0, + "reward": 1.05078125, + "reward_std": 0.8461795896291733, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.421875, + "rewards/format_reward/std": 0.5049516260623932, + 
"rewards/tag_count_reward/mean": 0.62890625, + "rewards/tag_count_reward/std": 0.400371789932251, + "step": 1033, + "token_counts/after_target": 1005.25, + "token_counts/after_think": 64.75, + "token_counts/before_target": 2054.0, + "token_counts/before_think": 1883.75 + }, + { + "avg_penalty/after_target": 2.402832567691803, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.32213079184293747, + "avg_penalty/before_think": 0.6922260373830795, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 785.25, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 412.8125, + "completions/mean_terminated_length": 403.3322982788086, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.517, + "grad_norm": 2.7831151485443115, + "kl": 13.625, + "learning_rate": 1.1166707370993333e-05, + "loss": 1.1373, + "num_tokens": 32354882.0, + "reward": 1.015625, + "reward_std": 0.9199317991733551, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.46875, + "rewards/format_reward/std": 0.5102732330560684, + "rewards/tag_count_reward/mean": 0.546875, + "rewards/tag_count_reward/std": 0.44543351978063583, + "step": 1034, + "token_counts/after_target": 1421.5, + "token_counts/after_think": 148.75, + "token_counts/before_target": 2949.75, + "token_counts/before_think": 2085.0 + }, + { + "avg_penalty/after_target": 2.7691319584846497, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3795447573065758, + "avg_penalty/before_think": 0.5233340635895729, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.25, + 
"completions/max_terminated_length": 637.25, + "completions/mean_length": 334.328125, + "completions/mean_terminated_length": 334.328125, + "completions/min_length": 111.75, + "completions/min_terminated_length": 111.75, + "epoch": 0.5175, + "grad_norm": 5.365878582000732, + "kl": 6.4921875, + "learning_rate": 1.1149371504928667e-05, + "loss": 0.7121, + "num_tokens": 32385591.0, + "reward": 1.21875, + "reward_std": 0.873881921172142, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.500852182507515, + "rewards/tag_count_reward/mean": 0.640625, + "rewards/tag_count_reward/std": 0.41667351871728897, + "step": 1035, + "token_counts/after_target": 1001.25, + "token_counts/after_think": 37.0, + "token_counts/before_target": 2175.5, + "token_counts/before_think": 2135.5 + }, + { + "avg_penalty/after_target": 2.186718374490738, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3426094278693199, + "avg_penalty/before_think": 0.7443515360355377, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 615.0, + "completions/max_terminated_length": 615.0, + "completions/mean_length": 359.0625, + "completions/mean_terminated_length": 359.0625, + "completions/min_length": 126.25, + "completions/min_terminated_length": 126.25, + "epoch": 0.518, + "grad_norm": 10.051237106323242, + "kl": 6.98046875, + "learning_rate": 1.113203213767907e-05, + "loss": 0.9348, + "num_tokens": 32421835.0, + "reward": 1.39453125, + "reward_std": 0.8777185678482056, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.11180340498685837, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.46449070423841476, + "rewards/tag_count_reward/mean": 0.67578125, + 
"rewards/tag_count_reward/std": 0.3872487396001816, + "step": 1036, + "token_counts/after_target": 1040.75, + "token_counts/after_think": 244.75, + "token_counts/before_target": 2230.25, + "token_counts/before_think": 2229.25 + }, + { + "avg_penalty/after_target": 2.4379009008407593, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.34557633846998215, + "avg_penalty/before_think": 0.5027211830019951, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.75, + "completions/max_terminated_length": 571.75, + "completions/mean_length": 353.828125, + "completions/mean_terminated_length": 353.828125, + "completions/min_length": 135.25, + "completions/min_terminated_length": 135.25, + "epoch": 0.5185, + "grad_norm": 4.053790092468262, + "kl": 9.359375, + "learning_rate": 1.1114689322063255e-05, + "loss": 0.8823, + "num_tokens": 32453984.0, + "reward": 1.05078125, + "reward_std": 0.9294790178537369, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.515625, + "rewards/format_reward/std": 0.5028772801160812, + "rewards/tag_count_reward/mean": 0.53515625, + "rewards/tag_count_reward/std": 0.4521555155515671, + "step": 1037, + "token_counts/after_target": 912.75, + "token_counts/after_think": 193.25, + "token_counts/before_target": 2558.5, + "token_counts/before_think": 1996.75 + }, + { + "avg_penalty/after_target": 2.04341459274292, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.38834525272250175, + "avg_penalty/before_think": 0.7913864701986313, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.25, + "completions/max_terminated_length": 659.25, + 
"completions/mean_length": 336.171875, + "completions/mean_terminated_length": 336.171875, + "completions/min_length": 106.5, + "completions/min_terminated_length": 106.5, + "epoch": 0.519, + "grad_norm": 5.709754943847656, + "kl": 9.0546875, + "learning_rate": 1.1097343110910452e-05, + "loss": 0.9566, + "num_tokens": 32485339.0, + "reward": 1.19921875, + "reward_std": 0.9837557226419449, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.5071863383054733, + "rewards/tag_count_reward/mean": 0.58984375, + "rewards/tag_count_reward/std": 0.45590148121118546, + "step": 1038, + "token_counts/after_target": 971.25, + "token_counts/after_think": 116.0, + "token_counts/before_target": 2294.25, + "token_counts/before_think": 1997.25 + }, + { + "avg_penalty/after_target": 2.428991198539734, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3587685003876686, + "avg_penalty/before_think": 0.5868790969252586, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.5, + "completions/max_terminated_length": 588.5, + "completions/mean_length": 299.484375, + "completions/mean_terminated_length": 299.484375, + "completions/min_length": 114.75, + "completions/min_terminated_length": 114.75, + "epoch": 0.5195, + "grad_norm": 7.932430744171143, + "kl": 6.6640625, + "learning_rate": 1.1079993557060228e-05, + "loss": 0.8143, + "num_tokens": 32518874.0, + "reward": 1.29296875, + "reward_std": 0.8423445373773575, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4697679653763771, + "rewards/tag_count_reward/mean": 0.63671875, + "rewards/tag_count_reward/std": 0.3981419801712036, + "step": 
1039, + "token_counts/after_target": 829.5, + "token_counts/after_think": 154.25, + "token_counts/before_target": 2093.0, + "token_counts/before_think": 1715.0 + }, + { + "avg_penalty/after_target": 2.2070354521274567, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.322839692234993, + "avg_penalty/before_think": 0.6441878974437714, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.75, + "completions/max_terminated_length": 482.75, + "completions/mean_length": 301.1875, + "completions/mean_terminated_length": 301.1875, + "completions/min_length": 74.25, + "completions/min_terminated_length": 74.25, + "epoch": 0.52, + "grad_norm": 8.002808570861816, + "kl": 6.5234375, + "learning_rate": 1.1062640713362333e-05, + "loss": 0.8232, + "num_tokens": 32546278.0, + "reward": 1.33203125, + "reward_std": 0.881516844034195, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.47354350984096527, + "rewards/tag_count_reward/mean": 0.66015625, + "rewards/tag_count_reward/std": 0.4259575605392456, + "step": 1040, + "token_counts/after_target": 566.5, + "token_counts/after_think": 385.0, + "token_counts/before_target": 2116.0, + "token_counts/before_think": 1751.5 + }, + { + "avg_penalty/after_target": 2.453754425048828, + "avg_penalty/after_think": 2.7351613640785217, + "avg_penalty/before_target": 0.44042300805449486, + "avg_penalty/before_think": 0.6802855581045151, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 818.0, + "completions/max_terminated_length": 526.5, + "completions/mean_length": 298.21875, + 
"completions/mean_terminated_length": 274.62396240234375, + "completions/min_length": 52.25, + "completions/min_terminated_length": 52.25, + "epoch": 0.5205, + "grad_norm": 5.100199222564697, + "kl": 10.6875, + "learning_rate": 1.1045284632676535e-05, + "loss": 1.1312, + "num_tokens": 32574564.0, + "reward": 1.1953125, + "reward_std": 0.9376004487276077, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.50393907725811, + "rewards/tag_count_reward/mean": 0.6015625, + "rewards/tag_count_reward/std": 0.4523979127407074, + "step": 1041, + "token_counts/after_target": 945.0, + "token_counts/after_think": 116.75, + "token_counts/before_target": 2534.25, + "token_counts/before_think": 1175.5 + }, + { + "avg_penalty/after_target": 2.018095523118973, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.37469425797462463, + "avg_penalty/before_think": 0.519971638917923, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.5, + "completions/max_terminated_length": 534.5, + "completions/mean_length": 279.609375, + "completions/mean_terminated_length": 279.609375, + "completions/min_length": 46.75, + "completions/min_terminated_length": 46.75, + "epoch": 0.521, + "grad_norm": 6.4037346839904785, + "kl": 14.828125, + "learning_rate": 1.102792536787247e-05, + "loss": 1.0405, + "num_tokens": 32602139.0, + "reward": 1.015625, + "reward_std": 0.9784199297428131, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.515625, + "rewards/format_reward/std": 0.5112857818603516, + "rewards/tag_count_reward/mean": 0.484375, + "rewards/tag_count_reward/std": 0.4561750441789627, + "step": 1042, + "token_counts/after_target": 663.5, + 
"token_counts/after_think": 237.75, + "token_counts/before_target": 2196.5, + "token_counts/before_think": 1376.0 + }, + { + "avg_penalty/after_target": 2.5525640845298767, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.25645193830132484, + "avg_penalty/before_think": 0.5808303579688072, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 279.375, + "completions/mean_terminated_length": 279.375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.5215, + "grad_norm": 1.8845096826553345, + "kl": 12.3125, + "learning_rate": 1.1010562971829464e-05, + "loss": 1.0604, + "num_tokens": 32631155.0, + "reward": 1.34375, + "reward_std": 0.9882455468177795, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.14789126068353653, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.48935678601264954, + "rewards/tag_count_reward/mean": 0.640625, + "rewards/tag_count_reward/std": 0.45209622383117676, + "step": 1043, + "token_counts/after_target": 578.0, + "token_counts/after_think": 266.5, + "token_counts/before_target": 2199.25, + "token_counts/before_think": 1426.25 + }, + { + "avg_penalty/after_target": 2.4573631584644318, + "avg_penalty/after_think": 2.340366005897522, + "avg_penalty/before_target": 0.40461941063404083, + "avg_penalty/before_think": 0.6882390677928925, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.75, + "completions/max_terminated_length": 573.75, + "completions/mean_length": 286.015625, + "completions/mean_terminated_length": 286.015625, + 
"completions/min_length": 43.5, + "completions/min_terminated_length": 43.5, + "epoch": 0.522, + "grad_norm": 2.9641077518463135, + "kl": 15.015625, + "learning_rate": 1.0993197497436392e-05, + "loss": 1.1864, + "num_tokens": 32661588.0, + "reward": 1.15234375, + "reward_std": 0.9445421695709229, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.5049516260623932, + "rewards/tag_count_reward/mean": 0.57421875, + "rewards/tag_count_reward/std": 0.46261872351169586, + "step": 1044, + "token_counts/after_target": 942.75, + "token_counts/after_think": 76.25, + "token_counts/before_target": 2162.0, + "token_counts/before_think": 1395.25 + }, + { + "avg_penalty/after_target": 2.639480710029602, + "avg_penalty/after_think": 1.8199257850646973, + "avg_penalty/before_target": 0.3799167647957802, + "avg_penalty/before_think": 0.5453528538346291, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.75, + "completions/max_terminated_length": 634.75, + "completions/mean_length": 293.25, + "completions/mean_terminated_length": 293.25, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.5225, + "grad_norm": 11.758684158325195, + "kl": 24.5, + "learning_rate": 1.0975828997591496e-05, + "loss": 1.7526, + "num_tokens": 32688804.0, + "reward": 0.99609375, + "reward_std": 0.9549857974052429, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.46875, + "rewards/format_reward/std": 0.5040994435548782, + "rewards/tag_count_reward/mean": 0.51171875, + "rewards/tag_count_reward/std": 0.4728490188717842, + "step": 1045, + "token_counts/after_target": 1108.75, + "token_counts/after_think": 97.75, + 
"token_counts/before_target": 2214.25, + "token_counts/before_think": 1271.25 + }, + { + "avg_penalty/after_target": 3.221062183380127, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.34538882225751877, + "avg_penalty/before_think": 0.3841341622173786, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.5, + "completions/max_terminated_length": 619.5, + "completions/mean_length": 235.375, + "completions/mean_terminated_length": 235.375, + "completions/min_length": 20.75, + "completions/min_terminated_length": 20.75, + "epoch": 0.523, + "grad_norm": 3.991199254989624, + "kl": 19.15625, + "learning_rate": 1.0958457525202241e-05, + "loss": 1.4369, + "num_tokens": 32713068.0, + "reward": 1.14453125, + "reward_std": 0.9301393628120422, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.5030868947505951, + "rewards/tag_count_reward/mean": 0.56640625, + "rewards/tag_count_reward/std": 0.4456104189157486, + "step": 1046, + "token_counts/after_target": 654.25, + "token_counts/after_think": 124.25, + "token_counts/before_target": 1770.25, + "token_counts/before_think": 1217.25 + }, + { + "avg_penalty/after_target": 2.706880211830139, + "avg_penalty/after_think": 3.7960805892944336, + "avg_penalty/before_target": 0.41087624058127403, + "avg_penalty/before_think": 0.5122971758246422, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.5, + "completions/max_terminated_length": 605.5, + "completions/mean_length": 292.328125, + "completions/mean_terminated_length": 292.328125, + "completions/min_length": 81.75, + 
"completions/min_terminated_length": 81.75, + "epoch": 0.5235, + "grad_norm": 6.810196399688721, + "kl": 20.28125, + "learning_rate": 1.0941083133185146e-05, + "loss": 1.4736, + "num_tokens": 32741393.0, + "reward": 1.2265625, + "reward_std": 0.9237359315156937, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.49776528775691986, + "rewards/tag_count_reward/mean": 0.6015625, + "rewards/tag_count_reward/std": 0.4481533095240593, + "step": 1047, + "token_counts/after_target": 913.25, + "token_counts/after_think": 49.75, + "token_counts/before_target": 2526.25, + "token_counts/before_think": 1188.0 + }, + { + "avg_penalty/after_target": 2.561535120010376, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3798987530171871, + "avg_penalty/before_think": 0.5926431119441986, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.75, + "completions/max_terminated_length": 663.75, + "completions/mean_length": 288.1875, + "completions/mean_terminated_length": 288.1875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.524, + "grad_norm": 5.311906337738037, + "kl": 18.203125, + "learning_rate": 1.0923705874465617e-05, + "loss": 1.4478, + "num_tokens": 32768557.0, + "reward": 1.46875, + "reward_std": 0.9752759337425232, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.12909944355487823, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4713720977306366, + "rewards/tag_count_reward/mean": 0.65625, + "rewards/tag_count_reward/std": 0.41869857907295227, + "step": 1048, + "token_counts/after_target": 765.5, + "token_counts/after_think": 101.25, + "token_counts/before_target": 2549.75, + 
"token_counts/before_think": 1194.5 + }, + { + "avg_penalty/after_target": 2.154070734977722, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.41750939190387726, + "avg_penalty/before_think": 0.7192282378673553, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 312.140625, + "completions/mean_terminated_length": 312.140625, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.5245, + "grad_norm": 2.297111749649048, + "kl": 17.78125, + "learning_rate": 1.0906325801977804e-05, + "loss": 1.5618, + "num_tokens": 32801110.0, + "reward": 1.41796875, + "reward_std": 0.8480815142393112, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4471946656703949, + "rewards/tag_count_reward/mean": 0.68359375, + "rewards/tag_count_reward/std": 0.39657578617334366, + "step": 1049, + "token_counts/after_target": 994.25, + "token_counts/after_think": 110.0, + "token_counts/before_target": 2963.0, + "token_counts/before_think": 927.0 + }, + { + "avg_penalty/after_target": 3.006866991519928, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.2791532017290592, + "avg_penalty/before_think": 0.49011562019586563, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 251.921875, + "completions/mean_terminated_length": 251.921875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.525, + 
"grad_norm": 2.134791612625122, + "kl": 14.484375, + "learning_rate": 1.0888942968664417e-05, + "loss": 1.2176, + "num_tokens": 32827729.0, + "reward": 1.4375, + "reward_std": 0.9429866522550583, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.11180340498685837, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.47354350984096527, + "rewards/tag_count_reward/mean": 0.703125, + "rewards/tag_count_reward/std": 0.42193443328142166, + "step": 1050, + "token_counts/after_target": 389.0, + "token_counts/after_think": 215.0, + "token_counts/before_target": 2012.75, + "token_counts/before_think": 1414.0 + }, + { + "avg_penalty/after_target": 2.6805565655231476, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3052711859345436, + "avg_penalty/before_think": 0.5174956545233727, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 253.0625, + "completions/mean_terminated_length": 253.0625, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.5255, + "grad_norm": 8.89513874053955, + "kl": 14.015625, + "learning_rate": 1.0871557427476585e-05, + "loss": 1.4262, + "num_tokens": 32854341.0, + "reward": 1.44921875, + "reward_std": 0.8556235432624817, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.45508860796689987, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.4137125387787819, + "step": 1051, + "token_counts/after_target": 723.5, + "token_counts/after_think": 104.25, + "token_counts/before_target": 1978.75, + "token_counts/before_think": 1242.5 + }, + { + "avg_penalty/after_target": 
2.2337734699249268, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5081320255994797, + "avg_penalty/before_think": 0.5010787099599838, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 610.0, + "completions/max_terminated_length": 610.0, + "completions/mean_length": 274.484375, + "completions/mean_terminated_length": 274.484375, + "completions/min_length": 78.25, + "completions/min_terminated_length": 78.25, + "epoch": 0.526, + "grad_norm": 4.095811367034912, + "kl": 16.296875, + "learning_rate": 1.0854169231373677e-05, + "loss": 1.4836, + "num_tokens": 32880020.0, + "reward": 1.41796875, + "reward_std": 0.8695410788059235, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4682852029800415, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.4201314076781273, + "step": 1052, + "token_counts/after_target": 1020.25, + "token_counts/after_think": 28.25, + "token_counts/before_target": 2173.75, + "token_counts/before_think": 1169.5 + }, + { + "avg_penalty/after_target": 2.583633154630661, + "avg_penalty/after_think": 2.7153822779655457, + "avg_penalty/before_target": 0.370065800845623, + "avg_penalty/before_think": 0.4094550721347332, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.5, + "completions/max_terminated_length": 489.5, + "completions/mean_length": 208.96875, + "completions/mean_terminated_length": 208.96875, + "completions/min_length": 65.5, + "completions/min_terminated_length": 65.5, + "epoch": 0.5265, + "grad_norm": 6.8410725593566895, + "kl": 12.828125, + 
"learning_rate": 1.083677843332316e-05, + "loss": 1.277, + "num_tokens": 32904562.0, + "reward": 1.5546875, + "reward_std": 0.7852504998445511, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.43303824216127396, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.37109920382499695, + "step": 1053, + "token_counts/after_target": 528.25, + "token_counts/after_think": 34.25, + "token_counts/before_target": 1672.5, + "token_counts/before_think": 1108.5 + }, + { + "avg_penalty/after_target": 2.566372573375702, + "avg_penalty/after_think": 2.9582483172416687, + "avg_penalty/before_target": 0.5427545011043549, + "avg_penalty/before_think": 0.4676695168018341, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 792.0, + "completions/max_terminated_length": 659.5, + "completions/mean_length": 263.765625, + "completions/mean_terminated_length": 251.91354370117188, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.527, + "grad_norm": 10.511590957641602, + "kl": 19.6875, + "learning_rate": 1.0819385086300412e-05, + "loss": 2.0281, + "num_tokens": 32930627.0, + "reward": 1.48046875, + "reward_std": 0.8200548887252808, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.38358907401561737, + "step": 1054, + "token_counts/after_target": 986.0, + "token_counts/after_think": 68.0, + "token_counts/before_target": 1978.5, + "token_counts/before_think": 1187.75 + }, + { + "avg_penalty/after_target": 2.4719693064689636, + "avg_penalty/after_think": 
2.9072502851486206, + "avg_penalty/before_target": 0.4818997085094452, + "avg_penalty/before_think": 0.6758421212434769, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 746.5, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 259.078125, + "completions/mean_terminated_length": 247.32708740234375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.5275, + "grad_norm": 8.242751121520996, + "kl": 17.0546875, + "learning_rate": 1.0801989243288588e-05, + "loss": 1.805, + "num_tokens": 32962584.0, + "reward": 1.48046875, + "reward_std": 0.7587194293737411, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4260597825050354, + "rewards/tag_count_reward/mean": 0.73046875, + "rewards/tag_count_reward/std": 0.35147490352392197, + "step": 1055, + "token_counts/after_target": 922.0, + "token_counts/after_think": 219.5, + "token_counts/before_target": 2119.25, + "token_counts/before_think": 884.5 + }, + { + "avg_penalty/after_target": 3.089338481426239, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.32828017696738243, + "avg_penalty/before_think": 0.5451310351490974, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 244.890625, + "completions/mean_terminated_length": 244.890625, + "completions/min_length": 28.5, + "completions/min_terminated_length": 28.5, + "epoch": 0.528, + "grad_norm": 2.493723154067993, + "kl": 16.921875, + "learning_rate": 1.0784590957278452e-05, + "loss": 
1.5139, + "num_tokens": 32990481.0, + "reward": 1.51953125, + "reward_std": 0.7894981652498245, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.41898179799318314, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.3873853459954262, + "step": 1056, + "token_counts/after_target": 589.5, + "token_counts/after_think": 170.25, + "token_counts/before_target": 2047.75, + "token_counts/before_think": 1110.75 + }, + { + "avg_penalty/after_target": 2.5082192718982697, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3939966708421707, + "avg_penalty/before_think": 0.41242408752441406, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.5, + "completions/max_terminated_length": 543.5, + "completions/mean_length": 220.734375, + "completions/mean_terminated_length": 220.734375, + "completions/min_length": 48.75, + "completions/min_terminated_length": 48.75, + "epoch": 0.5285, + "grad_norm": 5.719353199005127, + "kl": 16.6328125, + "learning_rate": 1.0767190281268187e-05, + "loss": 1.6223, + "num_tokens": 33012544.0, + "reward": 1.484375, + "reward_std": 0.8268771767616272, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.44974804669618607, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.3968922048807144, + "step": 1057, + "token_counts/after_target": 576.75, + "token_counts/after_think": 157.25, + "token_counts/before_target": 1865.5, + "token_counts/before_think": 932.25 + }, + { + "avg_penalty/after_target": 2.0264768600463867, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.45676034688949585, + 
"avg_penalty/before_think": 0.5324059650301933, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 783.0, + "completions/max_terminated_length": 711.75, + "completions/mean_length": 299.0625, + "completions/mean_terminated_length": 287.6156311035156, + "completions/min_length": 48.25, + "completions/min_terminated_length": 48.25, + "epoch": 0.529, + "grad_norm": 8.896907806396484, + "kl": 22.765625, + "learning_rate": 1.0749787268263279e-05, + "loss": 1.6955, + "num_tokens": 33043860.0, + "reward": 1.4140625, + "reward_std": 0.8635172098875046, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.6953125, + "rewards/tag_count_reward/std": 0.4190337136387825, + "step": 1058, + "token_counts/after_target": 964.0, + "token_counts/after_think": 43.75, + "token_counts/before_target": 2655.5, + "token_counts/before_think": 1121.75 + }, + { + "avg_penalty/after_target": 2.2906574606895447, + "avg_penalty/after_think": 3.9142937064170837, + "avg_penalty/before_target": 0.25062262266874313, + "avg_penalty/before_think": 0.5504142642021179, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.5, + "completions/max_terminated_length": 577.5, + "completions/mean_length": 231.203125, + "completions/mean_terminated_length": 231.203125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.5295, + "grad_norm": 8.24549388885498, + "kl": 19.5625, + "learning_rate": 1.0732381971276318e-05, + "loss": 1.4589, + "num_tokens": 33067777.0, + "reward": 1.703125, + 
"reward_std": 0.8702612817287445, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.1280868947505951, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.38321947306394577, + "step": 1059, + "token_counts/after_target": 270.75, + "token_counts/after_think": 96.5, + "token_counts/before_target": 2118.0, + "token_counts/before_think": 1214.0 + }, + { + "avg_penalty/after_target": 3.3055636286735535, + "avg_penalty/after_think": 2.8377551436424255, + "avg_penalty/before_target": 0.4909198544919491, + "avg_penalty/before_think": 0.5116158276796341, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 689.0, + "completions/max_terminated_length": 636.5, + "completions/mean_length": 222.453125, + "completions/mean_terminated_length": 211.06354522705078, + "completions/min_length": 49.5, + "completions/min_terminated_length": 49.5, + "epoch": 0.53, + "grad_norm": 4.6860246658325195, + "kl": 20.921875, + "learning_rate": 1.071497444332686e-05, + "loss": 1.971, + "num_tokens": 33091550.0, + "reward": 1.6484375, + "reward_std": 0.6857802867889404, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.30025141686201096, + "step": 1060, + "token_counts/after_target": 709.75, + "token_counts/after_think": 114.0, + "token_counts/before_target": 1556.75, + "token_counts/before_think": 1178.75 + }, + { + "avg_penalty/after_target": 3.316552221775055, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.4533049985766411, + "avg_penalty/before_think": 
0.39651302248239517, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 638.0, + "completions/max_terminated_length": 475.5, + "completions/mean_length": 217.96875, + "completions/mean_terminated_length": 204.23854446411133, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.5305, + "grad_norm": 5.934437274932861, + "kl": 29.15625, + "learning_rate": 1.0697564737441254e-05, + "loss": 2.2932, + "num_tokens": 33114556.0, + "reward": 1.38671875, + "reward_std": 0.8677244782447815, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.48456869274377823, + "rewards/tag_count_reward/mean": 0.73046875, + "rewards/tag_count_reward/std": 0.3982677310705185, + "step": 1061, + "token_counts/after_target": 823.5, + "token_counts/after_think": 84.5, + "token_counts/before_target": 1662.0, + "token_counts/before_think": 917.5 + }, + { + "avg_penalty/after_target": 1.7185055017471313, + "avg_penalty/after_think": 3.8028175830841064, + "avg_penalty/before_target": 0.27439240366220474, + "avg_penalty/before_think": 0.6112039685249329, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 192.125, + "completions/mean_terminated_length": 192.125, + "completions/min_length": 46.5, + "completions/min_terminated_length": 46.5, + "epoch": 0.531, + "grad_norm": 19.68060302734375, + "kl": 28.6875, + "learning_rate": 1.0680152906652483e-05, + "loss": 1.8272, + "num_tokens": 33135268.0, + "reward": 1.296875, + "reward_std": 0.8835745304822922, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.500852182507515, + "rewards/tag_count_reward/mean": 0.6875, + "rewards/tag_count_reward/std": 0.4023250639438629, + "step": 1062, + "token_counts/after_target": 285.5, + "token_counts/after_think": 147.25, + "token_counts/before_target": 1854.0, + "token_counts/before_think": 787.25 + }, + { + "avg_penalty/after_target": 2.350041478872299, + "avg_penalty/after_think": 2.852053463459015, + "avg_penalty/before_target": 0.478169534355402, + "avg_penalty/before_think": 0.6730654537677765, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 666.25, + "completions/max_terminated_length": 620.75, + "completions/mean_length": 283.5625, + "completions/mean_terminated_length": 272.97708892822266, + "completions/min_length": 63.75, + "completions/min_terminated_length": 63.75, + "epoch": 0.5315, + "grad_norm": 7.419860363006592, + "kl": 21.171875, + "learning_rate": 1.0662739004000005e-05, + "loss": 1.6662, + "num_tokens": 33163768.0, + "reward": 1.453125, + "reward_std": 0.8234775811433792, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4761601909995079, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.37054119259119034, + "step": 1063, + "token_counts/after_target": 915.5, + "token_counts/after_think": 150.75, + "token_counts/before_target": 2040.0, + "token_counts/before_think": 1430.75 + }, + { + "avg_penalty/after_target": 2.1544883847236633, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3446805402636528, + "avg_penalty/before_think": 0.7148061245679855, + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.5, + "completions/max_terminated_length": 587.5, + "completions/mean_length": 209.9375, + "completions/mean_terminated_length": 209.9375, + "completions/min_length": 39.5, + "completions/min_terminated_length": 39.5, + "epoch": 0.532, + "grad_norm": 5.192633152008057, + "kl": 17.40625, + "learning_rate": 1.0645323082529582e-05, + "loss": 1.3597, + "num_tokens": 33188052.0, + "reward": 1.53515625, + "reward_std": 0.9177407324314117, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.1280868947505951, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.47354350984096527, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.3807477802038193, + "step": 1064, + "token_counts/after_target": 434.5, + "token_counts/after_think": 54.75, + "token_counts/before_target": 1769.25, + "token_counts/before_think": 1100.5 + }, + { + "avg_penalty/after_target": 1.9000736474990845, + "avg_penalty/after_think": 3.6677013635635376, + "avg_penalty/before_target": 0.5028962790966034, + "avg_penalty/before_think": 0.5459076836705208, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 819.25, + "completions/max_terminated_length": 742.25, + "completions/mean_length": 312.703125, + "completions/mean_terminated_length": 302.5562515258789, + "completions/min_length": 87.5, + "completions/min_terminated_length": 87.5, + "epoch": 0.5325, + "grad_norm": 5.618954181671143, + "kl": 22.8125, + "learning_rate": 1.0627905195293135e-05, + "loss": 1.782, + "num_tokens": 33220033.0, + "reward": 1.39453125, + "reward_std": 0.8808415532112122, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4613594636321068, + "rewards/tag_count_reward/mean": 0.70703125, + "rewards/tag_count_reward/std": 0.4263843819499016, + "step": 1065, + "token_counts/after_target": 772.5, + "token_counts/after_think": 354.5, + "token_counts/before_target": 2166.25, + "token_counts/before_think": 1710.0 + }, + { + "avg_penalty/after_target": 2.044517934322357, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5785723328590393, + "avg_penalty/before_think": 0.507897399365902, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 674.75, + "completions/max_terminated_length": 544.25, + "completions/mean_length": 284.53125, + "completions/mean_terminated_length": 272.2760429382324, + "completions/min_length": 83.75, + "completions/min_terminated_length": 83.75, + "epoch": 0.533, + "grad_norm": 5.270618438720703, + "kl": 21.75, + "learning_rate": 1.0610485395348571e-05, + "loss": 1.934, + "num_tokens": 33248243.0, + "reward": 1.40234375, + "reward_std": 0.8725578784942627, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.48148179799318314, + "rewards/tag_count_reward/mean": 0.73046875, + "rewards/tag_count_reward/std": 0.4061018377542496, + "step": 1066, + "token_counts/after_target": 998.25, + "token_counts/after_think": 300.5, + "token_counts/before_target": 2034.75, + "token_counts/before_think": 1219.0 + }, + { + "avg_penalty/after_target": 2.129112035036087, + "avg_penalty/after_think": 1.904992401599884, + "avg_penalty/before_target": 0.4755696579813957, + "avg_penalty/before_think": 0.4517533853650093, + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 805.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 332.484375, + "completions/mean_terminated_length": 288.60208892822266, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.5335, + "grad_norm": 2.4517242908477783, + "kl": 23.640625, + "learning_rate": 1.0593063735759619e-05, + "loss": 1.9279, + "num_tokens": 33278834.0, + "reward": 1.28515625, + "reward_std": 0.9031875282526016, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.49297719448804855, + "rewards/tag_count_reward/mean": 0.66015625, + "rewards/tag_count_reward/std": 0.42376185953617096, + "step": 1067, + "token_counts/after_target": 1187.5, + "token_counts/after_think": 27.75, + "token_counts/before_target": 2814.25, + "token_counts/before_think": 1290.25 + }, + { + "avg_penalty/after_target": 2.424347400665283, + "avg_penalty/after_think": 3.9837191104888916, + "avg_penalty/before_target": 0.29849158227443695, + "avg_penalty/before_think": 0.5395459085702896, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.0, + "completions/max_terminated_length": 569.0, + "completions/mean_length": 253.125, + "completions/mean_terminated_length": 253.125, + "completions/min_length": 85.25, + "completions/min_terminated_length": 85.25, + "epoch": 0.534, + "grad_norm": 6.493127822875977, + "kl": 12.265625, + "learning_rate": 1.0575640269595675e-05, + "loss": 1.2737, + "num_tokens": 33303530.0, + "reward": 1.6171875, + "reward_std": 0.7843188643455505, + "rewards/accuracy_reward/mean": 0.03125, + 
"rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4057852029800415, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3475511968135834, + "step": 1068, + "token_counts/after_target": 642.5, + "token_counts/after_think": 188.75, + "token_counts/before_target": 1817.0, + "token_counts/before_think": 1401.75 + }, + { + "avg_penalty/after_target": 1.857883244752884, + "avg_penalty/after_think": 3.914553165435791, + "avg_penalty/before_target": 0.4931022524833679, + "avg_penalty/before_think": 0.7438937574625015, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 798.75, + "completions/max_terminated_length": 756.5, + "completions/mean_length": 346.75, + "completions/mean_terminated_length": 335.7510452270508, + "completions/min_length": 121.25, + "completions/min_terminated_length": 121.25, + "epoch": 0.5345, + "grad_norm": 4.510323524475098, + "kl": 20.203125, + "learning_rate": 1.055821504993164e-05, + "loss": 1.7958, + "num_tokens": 33333066.0, + "reward": 1.296875, + "reward_std": 0.9140615165233612, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.48296456038951874, + "rewards/tag_count_reward/mean": 0.671875, + "rewards/tag_count_reward/std": 0.4422926679253578, + "step": 1069, + "token_counts/after_target": 1050.5, + "token_counts/after_think": 426.25, + "token_counts/before_target": 2658.75, + "token_counts/before_think": 1412.5 + }, + { + "avg_penalty/after_target": 2.1192080080509186, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.33708615228533745, + "avg_penalty/before_think": 0.6331523358821869, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.25, + "completions/max_terminated_length": 757.25, + "completions/mean_length": 328.265625, + "completions/mean_terminated_length": 328.265625, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.535, + "grad_norm": 5.166783809661865, + "kl": 12.1953125, + "learning_rate": 1.0540788129847757e-05, + "loss": 1.2562, + "num_tokens": 33366635.0, + "reward": 1.48046875, + "reward_std": 0.8008508235216141, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.43494731932878494, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.3822256475687027, + "step": 1070, + "token_counts/after_target": 612.5, + "token_counts/after_think": 274.5, + "token_counts/before_target": 1953.5, + "token_counts/before_think": 2411.75 + }, + { + "avg_penalty/after_target": 2.4779555201530457, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.6942021697759628, + "avg_penalty/before_think": 0.711614228785038, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 1015.0, + "completions/max_terminated_length": 822.75, + "completions/mean_length": 438.40625, + "completions/mean_terminated_length": 392.5470428466797, + "completions/min_length": 91.5, + "completions/min_terminated_length": 91.5, + "epoch": 0.5355, + "grad_norm": 4.0409159660339355, + "kl": 27.65625, + "learning_rate": 1.0523359562429441e-05, + "loss": 2.4436, + "num_tokens": 33407557.0, + "reward": 0.9140625, + "reward_std": 0.904589518904686, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.40625, + "rewards/format_reward/std": 0.48680340498685837, + "rewards/tag_count_reward/mean": 0.5078125, + "rewards/tag_count_reward/std": 0.4390285834670067, + "step": 1071, + "token_counts/after_target": 2461.25, + "token_counts/after_think": 141.0, + "token_counts/before_target": 3613.0, + "token_counts/before_think": 799.25 + }, + { + "avg_penalty/after_target": 1.8754038512706757, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5045553296804428, + "avg_penalty/before_think": 0.6819572597742081, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 776.25, + "completions/max_terminated_length": 672.25, + "completions/mean_length": 354.265625, + "completions/mean_terminated_length": 343.61771392822266, + "completions/min_length": 42.5, + "completions/min_terminated_length": 42.5, + "epoch": 0.536, + "grad_norm": 3.225289821624756, + "kl": 23.40625, + "learning_rate": 1.0505929400767134e-05, + "loss": 1.8976, + "num_tokens": 33440406.0, + "reward": 0.9375, + "reward_std": 0.899106964468956, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.49297719448804855, + "rewards/tag_count_reward/mean": 0.53125, + "rewards/tag_count_reward/std": 0.44504012167453766, + "step": 1072, + "token_counts/after_target": 1265.25, + "token_counts/after_think": 364.5, + "token_counts/before_target": 2851.0, + "token_counts/before_think": 1187.5 + }, + { + "avg_penalty/after_target": 1.9004035294055939, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3941604271531105, + "avg_penalty/before_think": 0.6343836486339569, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, 
+ "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.75, + "completions/max_terminated_length": 711.75, + "completions/mean_length": 323.5625, + "completions/mean_terminated_length": 323.5625, + "completions/min_length": 89.75, + "completions/min_terminated_length": 89.75, + "epoch": 0.5365, + "grad_norm": 4.0903544425964355, + "kl": 16.09375, + "learning_rate": 1.0488497697956134e-05, + "loss": 1.4675, + "num_tokens": 33470074.0, + "reward": 1.3359375, + "reward_std": 0.8522044569253922, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4939897432923317, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.39265119284391403, + "step": 1073, + "token_counts/after_target": 750.25, + "token_counts/after_think": 111.75, + "token_counts/before_target": 2044.25, + "token_counts/before_think": 2270.75 + }, + { + "avg_penalty/after_target": 2.2214550226926804, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.5392792075872421, + "avg_penalty/before_think": 0.62164506316185, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 892.5, + "completions/max_terminated_length": 828.0, + "completions/mean_length": 437.0625, + "completions/mean_terminated_length": 410.5000228881836, + "completions/min_length": 122.25, + "completions/min_terminated_length": 122.25, + "epoch": 0.537, + "grad_norm": 3.164299488067627, + "kl": 21.203125, + "learning_rate": 1.0471064507096427e-05, + "loss": 1.7187, + "num_tokens": 33513310.0, + "reward": 1.01171875, + "reward_std": 0.8858509808778763, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.421875, + 
"rewards/format_reward/std": 0.48558124154806137, + "rewards/tag_count_reward/mean": 0.57421875, + "rewards/tag_count_reward/std": 0.4041188284754753, + "step": 1074, + "token_counts/after_target": 1782.5, + "token_counts/after_think": 186.5, + "token_counts/before_target": 2748.75, + "token_counts/before_think": 2275.25 + }, + { + "avg_penalty/after_target": 2.3634583950042725, + "avg_penalty/after_think": 2.8783379793167114, + "avg_penalty/before_target": 0.6557421311736107, + "avg_penalty/before_think": 0.5769175514578819, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 872.0, + "completions/max_terminated_length": 761.25, + "completions/mean_length": 358.640625, + "completions/mean_terminated_length": 338.1166763305664, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.5375, + "grad_norm": 3.996779203414917, + "kl": 23.671875, + "learning_rate": 1.0453629881292537e-05, + "loss": 1.9635, + "num_tokens": 33548631.0, + "reward": 1.19921875, + "reward_std": 0.8689164519309998, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.48558124154806137, + "rewards/tag_count_reward/mean": 0.65234375, + "rewards/tag_count_reward/std": 0.40948084741830826, + "step": 1075, + "token_counts/after_target": 1344.75, + "token_counts/after_think": 156.25, + "token_counts/before_target": 2287.0, + "token_counts/before_think": 1950.25 + }, + { + "avg_penalty/after_target": 2.6511752009391785, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5006367340683937, + "avg_penalty/before_think": 0.5332556739449501, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, 
+ "completions/clipped_ratio": 0.0, + "completions/max_length": 656.5, + "completions/max_terminated_length": 656.5, + "completions/mean_length": 298.140625, + "completions/mean_terminated_length": 298.140625, + "completions/min_length": 58.25, + "completions/min_terminated_length": 58.25, + "epoch": 0.538, + "grad_norm": 3.4931113719940186, + "kl": 19.28125, + "learning_rate": 1.0436193873653362e-05, + "loss": 1.6907, + "num_tokens": 33577680.0, + "reward": 1.09765625, + "reward_std": 0.8904990553855896, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.48935678601264954, + "rewards/tag_count_reward/mean": 0.59765625, + "rewards/tag_count_reward/std": 0.42491596937179565, + "step": 1076, + "token_counts/after_target": 1056.75, + "token_counts/after_think": 160.0, + "token_counts/before_target": 2129.5, + "token_counts/before_think": 1424.0 + }, + { + "avg_penalty/after_target": 2.431126654148102, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3970876783132553, + "avg_penalty/before_think": 0.5399705395102501, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.75, + "completions/max_terminated_length": 657.75, + "completions/mean_length": 295.6875, + "completions/mean_terminated_length": 295.6875, + "completions/min_length": 41.75, + "completions/min_terminated_length": 41.75, + "epoch": 0.5385, + "grad_norm": 6.335768222808838, + "kl": 18.765625, + "learning_rate": 1.0418756537291996e-05, + "loss": 1.4619, + "num_tokens": 33606172.0, + "reward": 1.24609375, + "reward_std": 0.9231271296739578, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.500852182507515, + 
"rewards/tag_count_reward/mean": 0.66796875, + "rewards/tag_count_reward/std": 0.4558524265885353, + "step": 1077, + "token_counts/after_target": 865.0, + "token_counts/after_think": 155.0, + "token_counts/before_target": 2015.75, + "token_counts/before_think": 1695.25 + }, + { + "avg_penalty/after_target": 3.1366668045520782, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.35232095047831535, + "avg_penalty/before_think": 0.39531004801392555, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.5, + "completions/max_terminated_length": 519.5, + "completions/mean_length": 232.890625, + "completions/mean_terminated_length": 232.890625, + "completions/min_length": 58.25, + "completions/min_terminated_length": 58.25, + "epoch": 0.539, + "grad_norm": 4.20374870300293, + "kl": 18.78125, + "learning_rate": 1.0401317925325598e-05, + "loss": 1.6535, + "num_tokens": 33633637.0, + "reward": 1.1875, + "reward_std": 0.917208269238472, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.5625, + "rewards/format_reward/std": 0.4909028485417366, + "rewards/tag_count_reward/mean": 0.609375, + "rewards/tag_count_reward/std": 0.4279505982995033, + "step": 1078, + "token_counts/after_target": 743.75, + "token_counts/after_think": 87.25, + "token_counts/before_target": 1640.5, + "token_counts/before_think": 1254.75 + }, + { + "avg_penalty/after_target": 2.4496299624443054, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.38248787075281143, + "avg_penalty/before_think": 0.7015295848250389, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.75, + 
"completions/max_terminated_length": 550.75, + "completions/mean_length": 280.78125, + "completions/mean_terminated_length": 280.78125, + "completions/min_length": 55.75, + "completions/min_terminated_length": 55.75, + "epoch": 0.5395, + "grad_norm": 3.319000482559204, + "kl": 15.96875, + "learning_rate": 1.03838780908752e-05, + "loss": 1.471, + "num_tokens": 33658743.0, + "reward": 1.38671875, + "reward_std": 0.8888215124607086, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.48148179799318314, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.4168490394949913, + "step": 1079, + "token_counts/after_target": 822.75, + "token_counts/after_think": 187.75, + "token_counts/before_target": 1963.75, + "token_counts/before_think": 1518.25 + }, + { + "avg_penalty/after_target": 2.1522293984889984, + "avg_penalty/after_think": 3.9034499526023865, + "avg_penalty/before_target": 0.36190135031938553, + "avg_penalty/before_think": 0.5359638631343842, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 288.96875, + "completions/mean_terminated_length": 288.96875, + "completions/min_length": 69.75, + "completions/min_terminated_length": 69.75, + "epoch": 0.54, + "grad_norm": 5.864862442016602, + "kl": 20.484375, + "learning_rate": 1.0366437087065564e-05, + "loss": 1.591, + "num_tokens": 33686469.0, + "reward": 1.28125, + "reward_std": 0.8681564331054688, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.47354350984096527, + "rewards/tag_count_reward/mean": 0.671875, + "rewards/tag_count_reward/std": 
0.4125172048807144, + "step": 1080, + "token_counts/after_target": 688.25, + "token_counts/after_think": 218.75, + "token_counts/before_target": 2447.75, + "token_counts/before_think": 1268.75 + }, + { + "avg_penalty/after_target": 2.5712877213954926, + "avg_penalty/after_think": 3.99794340133667, + "avg_penalty/before_target": 0.4297579973936081, + "avg_penalty/before_think": 0.5323378965258598, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 598.75, + "completions/max_terminated_length": 539.25, + "completions/mean_length": 268.171875, + "completions/mean_terminated_length": 257.8687515258789, + "completions/min_length": 45.75, + "completions/min_terminated_length": 45.75, + "epoch": 0.5405, + "grad_norm": 3.2128050327301025, + "kl": 14.5625, + "learning_rate": 1.0348994967025012e-05, + "loss": 1.3211, + "num_tokens": 33713200.0, + "reward": 1.40234375, + "reward_std": 0.8411922007799149, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.45916909724473953, + "rewards/tag_count_reward/mean": 0.73046875, + "rewards/tag_count_reward/std": 0.3972490504384041, + "step": 1081, + "token_counts/after_target": 815.0, + "token_counts/after_think": 96.0, + "token_counts/before_target": 1722.75, + "token_counts/before_think": 1657.0 + }, + { + "avg_penalty/after_target": 2.6533923745155334, + "avg_penalty/after_think": 2.7350987792015076, + "avg_penalty/before_target": 0.3156989775598049, + "avg_penalty/before_think": 0.5529560744762421, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.5, + "completions/max_terminated_length": 487.5, + 
"completions/mean_length": 231.25, + "completions/mean_terminated_length": 231.25, + "completions/min_length": 44.25, + "completions/min_terminated_length": 44.25, + "epoch": 0.541, + "grad_norm": 7.87031888961792, + "kl": 19.484375, + "learning_rate": 1.0331551783885263e-05, + "loss": 1.4445, + "num_tokens": 33739728.0, + "reward": 1.2734375, + "reward_std": 0.9096841961145401, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4896806851029396, + "rewards/tag_count_reward/mean": 0.6640625, + "rewards/tag_count_reward/std": 0.4374951049685478, + "step": 1082, + "token_counts/after_target": 499.5, + "token_counts/after_think": 76.25, + "token_counts/before_target": 2305.75, + "token_counts/before_think": 818.5 + }, + { + "avg_penalty/after_target": 2.494377940893173, + "avg_penalty/after_think": 3.5635963082313538, + "avg_penalty/before_target": 0.32490580901503563, + "avg_penalty/before_think": 0.49049990624189377, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 229.65625, + "completions/mean_terminated_length": 229.65625, + "completions/min_length": 92.75, + "completions/min_terminated_length": 92.75, + "epoch": 0.5415, + "grad_norm": 2.4326796531677246, + "kl": 12.328125, + "learning_rate": 1.0314107590781284e-05, + "loss": 1.1603, + "num_tokens": 33763370.0, + "reward": 1.53125, + "reward_std": 0.7722885310649872, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4519384130835533, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.34800300747156143, + "step": 1083, + 
"token_counts/after_target": 414.5, + "token_counts/after_think": 152.5, + "token_counts/before_target": 1669.5, + "token_counts/before_think": 1438.0 + }, + { + "avg_penalty/after_target": 2.0928693413734436, + "avg_penalty/after_think": 3.8323493003845215, + "avg_penalty/before_target": 0.43351496011018753, + "avg_penalty/before_think": 0.5449445471167564, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 747.5, + "completions/max_terminated_length": 595.0, + "completions/mean_length": 254.75, + "completions/mean_terminated_length": 242.0875015258789, + "completions/min_length": 42.5, + "completions/min_terminated_length": 42.5, + "epoch": 0.542, + "grad_norm": 4.2176313400268555, + "kl": 20.296875, + "learning_rate": 1.0296662440851108e-05, + "loss": 1.6862, + "num_tokens": 33791098.0, + "reward": 1.30078125, + "reward_std": 0.8564379513263702, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.4871220737695694, + "rewards/tag_count_reward/mean": 0.67578125, + "rewards/tag_count_reward/std": 0.4031810238957405, + "step": 1084, + "token_counts/after_target": 819.5, + "token_counts/after_think": 91.25, + "token_counts/before_target": 2080.0, + "token_counts/before_think": 1085.25 + }, + { + "avg_penalty/after_target": 2.647818773984909, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.35741814970970154, + "avg_penalty/before_think": 0.39753811806440353, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.5, + "completions/max_terminated_length": 585.5, + "completions/mean_length": 238.390625, + 
"completions/mean_terminated_length": 238.390625, + "completions/min_length": 86.75, + "completions/min_terminated_length": 86.75, + "epoch": 0.5425, + "grad_norm": 2.713416814804077, + "kl": 21.53125, + "learning_rate": 1.0279216387235691e-05, + "loss": 1.791, + "num_tokens": 33814835.0, + "reward": 1.203125, + "reward_std": 0.9423228949308395, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.5030868947505951, + "rewards/tag_count_reward/mean": 0.625, + "rewards/tag_count_reward/std": 0.45975086838006973, + "step": 1085, + "token_counts/after_target": 702.25, + "token_counts/after_think": 88.5, + "token_counts/before_target": 2218.5, + "token_counts/before_think": 805.0 + }, + { + "avg_penalty/after_target": 2.4872944056987762, + "avg_penalty/after_think": 2.7598379254341125, + "avg_penalty/before_target": 0.3064771518111229, + "avg_penalty/before_think": 0.512530654668808, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.75, + "completions/max_terminated_length": 571.75, + "completions/mean_length": 232.375, + "completions/mean_terminated_length": 232.375, + "completions/min_length": 63.25, + "completions/min_terminated_length": 63.25, + "epoch": 0.543, + "grad_norm": 2.9246280193328857, + "kl": 15.75, + "learning_rate": 1.0261769483078734e-05, + "loss": 1.3447, + "num_tokens": 33841723.0, + "reward": 1.4140625, + "reward_std": 0.8328434377908707, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.457730233669281, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.3914523273706436, + "step": 1086, + "token_counts/after_target": 566.0, + "token_counts/after_think": 
109.5, + "token_counts/before_target": 2090.25, + "token_counts/before_think": 952.25 + }, + { + "avg_penalty/after_target": 2.3034898042678833, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3756170868873596, + "avg_penalty/before_think": 0.7334326356649399, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 730.25, + "completions/max_terminated_length": 632.5, + "completions/mean_length": 272.796875, + "completions/mean_terminated_length": 261.1729202270508, + "completions/min_length": 46.75, + "completions/min_terminated_length": 46.75, + "epoch": 0.5435, + "grad_norm": 7.214227676391602, + "kl": 16.546875, + "learning_rate": 1.0244321781526533e-05, + "loss": 1.6159, + "num_tokens": 33873134.0, + "reward": 1.4453125, + "reward_std": 0.8437733799219131, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.46513500809669495, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.4030057415366173, + "step": 1087, + "token_counts/after_target": 886.5, + "token_counts/after_think": 205.5, + "token_counts/before_target": 2193.75, + "token_counts/before_think": 1079.0 + }, + { + "avg_penalty/after_target": 3.0144742131233215, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.29596995934844017, + "avg_penalty/before_think": 0.4841551184654236, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.5, + "completions/max_terminated_length": 540.5, + "completions/mean_length": 231.84375, + "completions/mean_terminated_length": 231.84375, + "completions/min_length": 58.25, + 
"completions/min_terminated_length": 58.25, + "epoch": 0.544, + "grad_norm": 3.0268537998199463, + "kl": 10.671875, + "learning_rate": 1.0226873335727815e-05, + "loss": 1.0575, + "num_tokens": 33895860.0, + "reward": 1.578125, + "reward_std": 0.7773305922746658, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.43303824216127396, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.36216578632593155, + "step": 1088, + "token_counts/after_target": 399.0, + "token_counts/after_think": 92.75, + "token_counts/before_target": 1889.25, + "token_counts/before_think": 1328.5 + }, + { + "avg_penalty/after_target": 2.9999843537807465, + "avg_penalty/after_think": 3.930896520614624, + "avg_penalty/before_target": 0.25809627771377563, + "avg_penalty/before_think": 0.3425496146082878, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.25, + "completions/max_terminated_length": 546.25, + "completions/mean_length": 226.703125, + "completions/mean_terminated_length": 226.703125, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.5445, + "grad_norm": 5.94439697265625, + "kl": 13.6953125, + "learning_rate": 1.0209424198833571e-05, + "loss": 1.2996, + "num_tokens": 33920209.0, + "reward": 1.5625, + "reward_std": 0.7677344530820847, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.36943431943655014, + "step": 1089, + "token_counts/after_target": 413.75, + "token_counts/after_think": 151.25, + "token_counts/before_target": 2130.0, + 
"token_counts/before_think": 932.25 + }, + { + "avg_penalty/after_target": 2.3733585476875305, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5414149686694145, + "avg_penalty/before_think": 0.6045780628919601, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 662.0, + "completions/max_terminated_length": 654.5, + "completions/mean_length": 294.1875, + "completions/mean_terminated_length": 272.9665222167969, + "completions/min_length": 48.25, + "completions/min_terminated_length": 48.25, + "epoch": 0.545, + "grad_norm": 8.222551345825195, + "kl": 14.2421875, + "learning_rate": 1.01919744239969e-05, + "loss": 1.5523, + "num_tokens": 33946989.0, + "reward": 1.5859375, + "reward_std": 0.7433001697063446, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4154609143733978, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.3471912443637848, + "step": 1090, + "token_counts/after_target": 1102.5, + "token_counts/after_think": 143.25, + "token_counts/before_target": 2145.75, + "token_counts/before_think": 1315.5 + }, + { + "avg_penalty/after_target": 2.9394713640213013, + "avg_penalty/after_think": 2.8697453141212463, + "avg_penalty/before_target": 0.3202101103961468, + "avg_penalty/before_think": 0.58688173443079, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 221.734375, + "completions/mean_terminated_length": 221.734375, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + 
"epoch": 0.5455, + "grad_norm": 4.4257612228393555, + "kl": 11.87890625, + "learning_rate": 1.0174524064372837e-05, + "loss": 1.184, + "num_tokens": 33969548.0, + "reward": 1.671875, + "reward_std": 0.625297337770462, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.890625, + "rewards/tag_count_reward/std": 0.25091201812028885, + "step": 1091, + "token_counts/after_target": 537.5, + "token_counts/after_think": 42.0, + "token_counts/before_target": 1842.25, + "token_counts/before_think": 1126.0 + }, + { + "avg_penalty/after_target": 2.2733685672283173, + "avg_penalty/after_think": 2.51836895942688, + "avg_penalty/before_target": 0.41143982112407684, + "avg_penalty/before_think": 0.48031000792980194, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 617.5, + "completions/max_terminated_length": 468.75, + "completions/mean_length": 183.1875, + "completions/mean_terminated_length": 169.71041870117188, + "completions/min_length": 50.75, + "completions/min_terminated_length": 50.75, + "epoch": 0.546, + "grad_norm": 4.185104846954346, + "kl": 21.703125, + "learning_rate": 1.0157073173118207e-05, + "loss": 1.7521, + "num_tokens": 33990376.0, + "reward": 1.53125, + "reward_std": 0.8099117875099182, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.3827696070075035, + "step": 1092, + "token_counts/after_target": 549.75, + "token_counts/after_think": 59.25, + "token_counts/before_target": 1525.25, + "token_counts/before_think": 796.75 + }, + { + 
"avg_penalty/after_target": 2.8516887724399567, + "avg_penalty/after_think": 3.7966654300689697, + "avg_penalty/before_target": 0.25763319805264473, + "avg_penalty/before_think": 0.6001767441630363, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 203.125, + "completions/mean_terminated_length": 203.125, + "completions/min_length": 31.25, + "completions/min_terminated_length": 31.25, + "epoch": 0.5465, + "grad_norm": 9.847187995910645, + "kl": 23.03125, + "learning_rate": 1.0139621803391454e-05, + "loss": 1.6614, + "num_tokens": 34012016.0, + "reward": 1.42578125, + "reward_std": 0.9656107574701309, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.11967839300632477, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.47987766563892365, + "rewards/tag_count_reward/mean": 0.70703125, + "rewards/tag_count_reward/std": 0.4094977080821991, + "step": 1093, + "token_counts/after_target": 489.0, + "token_counts/after_think": 46.0, + "token_counts/before_target": 1842.0, + "token_counts/before_think": 873.0 + }, + { + "avg_penalty/after_target": 3.2836544513702393, + "avg_penalty/after_think": 3.9452247619628906, + "avg_penalty/before_target": 0.20858677476644516, + "avg_penalty/before_think": 0.38753243908286095, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.5, + "completions/max_terminated_length": 453.5, + "completions/mean_length": 157.953125, + "completions/mean_terminated_length": 157.953125, + "completions/min_length": 43.75, + "completions/min_terminated_length": 43.75, + "epoch": 0.547, + 
"grad_norm": 6.831762790679932, + "kl": 17.46875, + "learning_rate": 1.0122170008352472e-05, + "loss": 1.2552, + "num_tokens": 34032605.0, + "reward": 1.52734375, + "reward_std": 0.772891104221344, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4383598491549492, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.3486095815896988, + "step": 1094, + "token_counts/after_target": 177.75, + "token_counts/after_think": 51.0, + "token_counts/before_target": 1415.75, + "token_counts/before_think": 882.75 + }, + { + "avg_penalty/after_target": 2.3509587347507477, + "avg_penalty/after_think": 1.8262760043144226, + "avg_penalty/before_target": 0.29198940843343735, + "avg_penalty/before_think": 0.3795483186841011, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 157.4375, + "completions/mean_terminated_length": 157.4375, + "completions/min_length": 35.75, + "completions/min_terminated_length": 35.75, + "epoch": 0.5475, + "grad_norm": 10.013611793518066, + "kl": 21.46875, + "learning_rate": 1.010471784116246e-05, + "loss": 1.4149, + "num_tokens": 34054393.0, + "reward": 1.32421875, + "reward_std": 0.8992387354373932, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.48456869274377823, + "rewards/tag_count_reward/mean": 0.66796875, + "rewards/tag_count_reward/std": 0.4259954020380974, + "step": 1095, + "token_counts/after_target": 398.75, + "token_counts/after_think": 24.75, + "token_counts/before_target": 1503.75, + "token_counts/before_think": 591.75 + }, + { + "avg_penalty/after_target": 
2.0993039309978485, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3784142956137657, + "avg_penalty/before_think": 0.537875734269619, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 618.25, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 207.265625, + "completions/mean_terminated_length": 193.86666870117188, + "completions/min_length": 45.5, + "completions/min_terminated_length": 45.5, + "epoch": 0.548, + "grad_norm": 3.967379093170166, + "kl": 19.34375, + "learning_rate": 1.008726535498374e-05, + "loss": 1.6814, + "num_tokens": 34077210.0, + "reward": 1.52734375, + "reward_std": 0.7667472511529922, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.43655145168304443, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.3487986996769905, + "step": 1096, + "token_counts/after_target": 439.75, + "token_counts/after_think": 141.0, + "token_counts/before_target": 1940.5, + "token_counts/before_think": 795.0 + }, + { + "avg_penalty/after_target": 2.9576149582862854, + "avg_penalty/after_think": 3.6270273327827454, + "avg_penalty/before_target": 0.3750374875962734, + "avg_penalty/before_think": 0.4743715077638626, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.25, + "completions/max_terminated_length": 555.25, + "completions/mean_length": 182.4375, + "completions/mean_terminated_length": 182.4375, + "completions/min_length": 61.5, + "completions/min_terminated_length": 61.5, + "epoch": 0.5485, + "grad_norm": 3.214695930480957, + "kl": 16.5234375, + 
"learning_rate": 1.0069812602979617e-05, + "loss": 1.4516, + "num_tokens": 34098278.0, + "reward": 1.53125, + "reward_std": 0.8064665794372559, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.43655145168304443, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.3659198731184006, + "step": 1097, + "token_counts/after_target": 471.0, + "token_counts/after_think": 54.75, + "token_counts/before_target": 1594.75, + "token_counts/before_think": 798.5 + }, + { + "avg_penalty/after_target": 2.466766208410263, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3398231230676174, + "avg_penalty/before_think": 0.37833222001791, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.5, + "completions/max_terminated_length": 568.5, + "completions/mean_length": 231.25, + "completions/mean_terminated_length": 231.25, + "completions/min_length": 34.25, + "completions/min_terminated_length": 34.25, + "epoch": 0.549, + "grad_norm": 5.164125919342041, + "kl": 17.46875, + "learning_rate": 1.0052359638314195e-05, + "loss": 1.317, + "num_tokens": 34124838.0, + "reward": 1.4765625, + "reward_std": 0.8569705337285995, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4604102149605751, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.4058806970715523, + "step": 1098, + "token_counts/after_target": 543.25, + "token_counts/after_think": 138.0, + "token_counts/before_target": 1920.0, + "token_counts/before_think": 1098.75 + }, + { + "avg_penalty/after_target": 2.580375850200653, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 
0.5190240181982517, + "avg_penalty/before_think": 0.463306188583374, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.0, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 231.703125, + "completions/mean_terminated_length": 231.703125, + "completions/min_length": 60.75, + "completions/min_terminated_length": 60.75, + "epoch": 0.5495, + "grad_norm": 2.830512046813965, + "kl": 25.3125, + "learning_rate": 1.0034906514152239e-05, + "loss": 2.1335, + "num_tokens": 34148675.0, + "reward": 1.3046875, + "reward_std": 0.8987140953540802, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4819520115852356, + "rewards/tag_count_reward/mean": 0.6640625, + "rewards/tag_count_reward/std": 0.42989596724510193, + "step": 1099, + "token_counts/after_target": 1085.0, + "token_counts/after_think": 22.5, + "token_counts/before_target": 1968.5, + "token_counts/before_think": 631.25 + }, + { + "avg_penalty/after_target": 2.971705198287964, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.32025399059057236, + "avg_penalty/before_think": 0.36007795482873917, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.25, + "completions/max_terminated_length": 553.25, + "completions/mean_length": 238.734375, + "completions/mean_terminated_length": 238.734375, + "completions/min_length": 35.25, + "completions/min_terminated_length": 35.25, + "epoch": 0.55, + "grad_norm": 5.6207122802734375, + "kl": 14.28125, + "learning_rate": 1.0017453283658984e-05, + "loss": 1.3723, + "num_tokens": 34176178.0, + "reward": 1.55078125, + 
"reward_std": 0.8241044133901596, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.43655145168304443, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.38369932770729065, + "step": 1100, + "token_counts/after_target": 588.75, + "token_counts/after_think": 119.75, + "token_counts/before_target": 1920.0, + "token_counts/before_think": 1191.25 + }, + { + "avg_penalty/after_target": 2.414882242679596, + "avg_penalty/after_think": 2.9893811345100403, + "avg_penalty/before_target": 0.38706928864121437, + "avg_penalty/before_think": 0.4564807042479515, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 229.15625, + "completions/mean_terminated_length": 229.15625, + "completions/min_length": 45.25, + "completions/min_terminated_length": 45.25, + "epoch": 0.5505, + "grad_norm": 4.446382999420166, + "kl": 22.65625, + "learning_rate": 1e-05, + "loss": 1.7872, + "num_tokens": 34199852.0, + "reward": 1.3125, + "reward_std": 0.893057569861412, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4876555874943733, + "rewards/tag_count_reward/mean": 0.671875, + "rewards/tag_count_reward/std": 0.4213961809873581, + "step": 1101, + "token_counts/after_target": 706.5, + "token_counts/after_think": 34.0, + "token_counts/before_target": 2308.0, + "token_counts/before_think": 618.0 + }, + { + "avg_penalty/after_target": 3.2723769545555115, + "avg_penalty/after_think": 0.6341972351074219, + "avg_penalty/before_target": 0.39041245728731155, + "avg_penalty/before_think": 0.5266465246677399, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 770.25, + "completions/max_terminated_length": 629.5, + "completions/mean_length": 255.921875, + "completions/mean_terminated_length": 243.59375381469727, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.551, + "grad_norm": 10.015420913696289, + "kl": 19.171875, + "learning_rate": 9.982546716341019e-06, + "loss": 1.8483, + "num_tokens": 34226471.0, + "reward": 1.5078125, + "reward_std": 0.801092267036438, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4519384130835533, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.3797244429588318, + "step": 1102, + "token_counts/after_target": 930.5, + "token_counts/after_think": 80.25, + "token_counts/before_target": 1644.0, + "token_counts/before_think": 1440.0 + }, + { + "avg_penalty/after_target": 2.150133639574051, + "avg_penalty/after_think": 3.8499354124069214, + "avg_penalty/before_target": 0.3870435729622841, + "avg_penalty/before_think": 0.46429888904094696, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.5, + "completions/max_terminated_length": 611.5, + "completions/mean_length": 222.203125, + "completions/mean_terminated_length": 222.203125, + "completions/min_length": 59.5, + "completions/min_terminated_length": 59.5, + "epoch": 0.5515, + "grad_norm": 8.817916870117188, + "kl": 9.4453125, + "learning_rate": 9.965093485847766e-06, + "loss": 1.2395, + "num_tokens": 34252324.0, + "reward": 1.68359375, + "reward_std": 0.6758682429790497, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38336414843797684, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.3240770101547241, + "step": 1103, + "token_counts/after_target": 595.25, + "token_counts/after_think": 116.5, + "token_counts/before_target": 1911.25, + "token_counts/before_think": 932.25 + }, + { + "avg_penalty/after_target": 3.2258068323135376, + "avg_penalty/after_think": 1.7929168939590454, + "avg_penalty/before_target": 0.35020309686660767, + "avg_penalty/before_think": 0.510734036564827, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 643.75, + "completions/max_terminated_length": 553.5, + "completions/mean_length": 235.28125, + "completions/mean_terminated_length": 224.59062957763672, + "completions/min_length": 72.5, + "completions/min_terminated_length": 72.5, + "epoch": 0.552, + "grad_norm": 6.540412902832031, + "kl": 20.6796875, + "learning_rate": 9.947640361685805e-06, + "loss": 1.9059, + "num_tokens": 34280838.0, + "reward": 1.53125, + "reward_std": 0.7616227716207504, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.42206869274377823, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.3638370484113693, + "step": 1104, + "token_counts/after_target": 833.25, + "token_counts/after_think": 100.0, + "token_counts/before_target": 2098.25, + "token_counts/before_think": 733.0 + }, + { + "avg_penalty/after_target": 2.130323737859726, + "avg_penalty/after_think": 3.2361637353897095, + "avg_penalty/before_target": 0.4725203774869442, + "avg_penalty/before_think": 0.6196320056915283, + "clip_ratio/high_max": 0.0, 
+ "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 782.75, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 260.484375, + "completions/mean_terminated_length": 248.2458381652832, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.5525, + "grad_norm": 2.4834611415863037, + "kl": 14.17578125, + "learning_rate": 9.930187397020385e-06, + "loss": 1.3763, + "num_tokens": 34308469.0, + "reward": 1.63671875, + "reward_std": 0.7063456699252129, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.3979102149605751, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.3259999752044678, + "step": 1105, + "token_counts/after_target": 897.75, + "token_counts/after_think": 95.0, + "token_counts/before_target": 1748.75, + "token_counts/before_think": 1426.25 + }, + { + "avg_penalty/after_target": 2.5218533873558044, + "avg_penalty/after_think": 2.7204445600509644, + "avg_penalty/before_target": 0.5516486763954163, + "avg_penalty/before_think": 0.5875196307897568, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 265.453125, + "completions/mean_terminated_length": 265.453125, + "completions/min_length": 57.75, + "completions/min_terminated_length": 57.75, + "epoch": 0.553, + "grad_norm": 9.871855735778809, + "kl": 16.109375, + "learning_rate": 9.912734645016262e-06, + "loss": 1.7891, + "num_tokens": 34335970.0, + "reward": 1.71484375, + "reward_std": 0.6704410463571548, + "rewards/accuracy_reward/mean": 
0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.34944770485162735, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.32596971839666367, + "step": 1106, + "token_counts/after_target": 1079.75, + "token_counts/after_think": 37.75, + "token_counts/before_target": 1728.0, + "token_counts/before_think": 1401.75 + }, + { + "avg_penalty/after_target": 2.1533412635326385, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3599850796163082, + "avg_penalty/before_think": 0.5089019313454628, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 596.0, + "completions/max_terminated_length": 529.75, + "completions/mean_length": 269.421875, + "completions/mean_terminated_length": 258.7135467529297, + "completions/min_length": 65.25, + "completions/min_terminated_length": 65.25, + "epoch": 0.5535, + "grad_norm": 3.7692437171936035, + "kl": 13.15478515625, + "learning_rate": 9.895282158837545e-06, + "loss": 1.2353, + "num_tokens": 34366701.0, + "reward": 1.625, + "reward_std": 0.6062939837574959, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.3300696536898613, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.29131544567644596, + "step": 1107, + "token_counts/after_target": 719.75, + "token_counts/after_think": 144.25, + "token_counts/before_target": 1808.5, + "token_counts/before_think": 1638.25 + }, + { + "avg_penalty/after_target": 2.5654184222221375, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3488526940345764, + "avg_penalty/before_think": 0.4784020632505417, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.5, + "completions/max_terminated_length": 453.5, + "completions/mean_length": 201.28125, + "completions/mean_terminated_length": 201.28125, + "completions/min_length": 56.75, + "completions/min_terminated_length": 56.75, + "epoch": 0.554, + "grad_norm": 8.741196632385254, + "kl": 15.953125, + "learning_rate": 9.877829991647528e-06, + "loss": 1.1104, + "num_tokens": 34387535.0, + "reward": 1.515625, + "reward_std": 0.8593188673257828, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4519384130835533, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.4017387703061104, + "step": 1108, + "token_counts/after_target": 506.5, + "token_counts/after_think": 32.0, + "token_counts/before_target": 1897.5, + "token_counts/before_think": 784.5 + }, + { + "avg_penalty/after_target": 2.620893955230713, + "avg_penalty/after_think": 2.8813765048980713, + "avg_penalty/before_target": 0.30284298956394196, + "avg_penalty/before_think": 0.6238695681095123, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.5, + "completions/max_terminated_length": 505.5, + "completions/mean_length": 240.09375, + "completions/mean_terminated_length": 240.09375, + "completions/min_length": 61.75, + "completions/min_terminated_length": 61.75, + "epoch": 0.5545, + "grad_norm": 11.691778182983398, + "kl": 21.25, + "learning_rate": 9.860378196608549e-06, + "loss": 1.5375, + "num_tokens": 34410853.0, + "reward": 1.4609375, + "reward_std": 0.8667977899312973, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + 
"rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4761601909995079, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.39960990101099014, + "step": 1109, + "token_counts/after_target": 734.75, + "token_counts/after_think": 59.0, + "token_counts/before_target": 2254.25, + "token_counts/before_think": 793.5 + }, + { + "avg_penalty/after_target": 1.8203796446323395, + "avg_penalty/after_think": 3.6264198422431946, + "avg_penalty/before_target": 0.4104568213224411, + "avg_penalty/before_think": 0.7866656705737114, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.5, + "completions/max_terminated_length": 613.5, + "completions/mean_length": 273.21875, + "completions/mean_terminated_length": 273.21875, + "completions/min_length": 48.75, + "completions/min_terminated_length": 48.75, + "epoch": 0.555, + "grad_norm": 5.532568454742432, + "kl": 15.4453125, + "learning_rate": 9.842926826881796e-06, + "loss": 1.3193, + "num_tokens": 34439235.0, + "reward": 1.65625, + "reward_std": 0.6515316367149353, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.859375, + "rewards/tag_count_reward/std": 0.28289176523685455, + "step": 1110, + "token_counts/after_target": 819.0, + "token_counts/after_think": 195.0, + "token_counts/before_target": 2094.75, + "token_counts/before_think": 1262.75 + }, + { + "avg_penalty/after_target": 2.324946165084839, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.41077806055545807, + "avg_penalty/before_think": 0.580903671681881, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 230.28125, + "completions/mean_terminated_length": 230.28125, + "completions/min_length": 40.5, + "completions/min_terminated_length": 40.5, + "epoch": 0.5555, + "grad_norm": 2.941483497619629, + "kl": 12.3671875, + "learning_rate": 9.825475935627165e-06, + "loss": 1.0495, + "num_tokens": 34463861.0, + "reward": 1.53515625, + "reward_std": 0.7390217483043671, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.43616948276758194, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.3395022414624691, + "step": 1111, + "token_counts/after_target": 772.75, + "token_counts/after_think": 92.5, + "token_counts/before_target": 1688.25, + "token_counts/before_think": 1131.0 + }, + { + "avg_penalty/after_target": 3.02224063873291, + "avg_penalty/after_think": 3.514655113220215, + "avg_penalty/before_target": 0.3955075219273567, + "avg_penalty/before_think": 0.5525281652808189, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.25, + "completions/max_terminated_length": 706.25, + "completions/mean_length": 291.859375, + "completions/mean_terminated_length": 291.859375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.556, + "grad_norm": 12.59017562866211, + "kl": 27.03125, + "learning_rate": 9.808025576003106e-06, + "loss": 2.0212, + "num_tokens": 34491580.0, + "reward": 1.36328125, + "reward_std": 0.7885788083076477, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 
0.4612434431910515, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.34893495962023735, + "step": 1112, + "token_counts/after_target": 862.5, + "token_counts/after_think": 148.0, + "token_counts/before_target": 2337.5, + "token_counts/before_think": 1321.75 + }, + { + "avg_penalty/after_target": 2.614722579717636, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3915090411901474, + "avg_penalty/before_think": 0.640851765871048, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.25, + "completions/max_terminated_length": 525.25, + "completions/mean_length": 282.671875, + "completions/mean_terminated_length": 282.671875, + "completions/min_length": 31.75, + "completions/min_terminated_length": 31.75, + "epoch": 0.5565, + "grad_norm": 4.062381744384766, + "kl": 15.1796875, + "learning_rate": 9.790575801166432e-06, + "loss": 1.3036, + "num_tokens": 34518983.0, + "reward": 1.5, + "reward_std": 0.7137896418571472, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.45247192680835724, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.3068903610110283, + "step": 1113, + "token_counts/after_target": 836.5, + "token_counts/after_think": 198.0, + "token_counts/before_target": 2294.5, + "token_counts/before_think": 1193.75 + }, + { + "avg_penalty/after_target": 2.733410805463791, + "avg_penalty/after_think": 2.6674219965934753, + "avg_penalty/before_target": 0.5984224230051041, + "avg_penalty/before_think": 0.535756453871727, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + 
"completions/max_length": 854.0, + "completions/max_terminated_length": 806.75, + "completions/mean_length": 322.203125, + "completions/mean_terminated_length": 312.3416748046875, + "completions/min_length": 28.5, + "completions/min_terminated_length": 28.5, + "epoch": 0.557, + "grad_norm": 6.1666646003723145, + "kl": 25.703125, + "learning_rate": 9.773126664272186e-06, + "loss": 2.1146, + "num_tokens": 34549444.0, + "reward": 1.3046875, + "reward_std": 0.8544989824295044, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.5625, + "rewards/format_reward/std": 0.5059641748666763, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.39533529430627823, + "step": 1114, + "token_counts/after_target": 1587.0, + "token_counts/after_think": 46.25, + "token_counts/before_target": 2741.0, + "token_counts/before_think": 781.0 + }, + { + "avg_penalty/after_target": 2.033475309610367, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4836229607462883, + "avg_penalty/before_think": 0.5144027024507523, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 803.0, + "completions/max_terminated_length": 803.0, + "completions/mean_length": 329.796875, + "completions/mean_terminated_length": 329.796875, + "completions/min_length": 71.75, + "completions/min_terminated_length": 71.75, + "epoch": 0.5575, + "grad_norm": 8.398015975952148, + "kl": 21.53125, + "learning_rate": 9.75567821847347e-06, + "loss": 1.6221, + "num_tokens": 34581463.0, + "reward": 1.234375, + "reward_std": 0.7747030109167099, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.484375, + "rewards/format_reward/std": 0.500852182507515, + "rewards/tag_count_reward/mean": 0.75, + 
"rewards/tag_count_reward/std": 0.32929322123527527, + "step": 1115, + "token_counts/after_target": 1073.25, + "token_counts/after_think": 40.75, + "token_counts/before_target": 2747.25, + "token_counts/before_think": 1415.5 + }, + { + "avg_penalty/after_target": 2.501337170600891, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5370336547493935, + "avg_penalty/before_think": 0.690081998705864, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.25, + "completions/max_terminated_length": 763.25, + "completions/mean_length": 361.0, + "completions/mean_terminated_length": 361.0, + "completions/min_length": 46.5, + "completions/min_terminated_length": 46.5, + "epoch": 0.558, + "grad_norm": 5.01569128036499, + "kl": 18.3125, + "learning_rate": 9.738230516921272e-06, + "loss": 1.6763, + "num_tokens": 34614967.0, + "reward": 1.36328125, + "reward_std": 0.8552103191614151, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.48558124154806137, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.39600546658039093, + "step": 1116, + "token_counts/after_target": 1405.5, + "token_counts/after_think": 115.5, + "token_counts/before_target": 3143.0, + "token_counts/before_think": 1112.0 + }, + { + "avg_penalty/after_target": 2.8375200629234314, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3946327790617943, + "avg_penalty/before_think": 0.6488286629319191, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 810.0, + "completions/max_terminated_length": 538.75, + 
"completions/mean_length": 265.59375, + "completions/mean_terminated_length": 241.40000915527344, + "completions/min_length": 29.75, + "completions/min_terminated_length": 29.75, + "epoch": 0.5585, + "grad_norm": 2.841398000717163, + "kl": 18.7734375, + "learning_rate": 9.720783612764314e-06, + "loss": 1.6242, + "num_tokens": 34642733.0, + "reward": 1.37890625, + "reward_std": 0.8787551373243332, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.48935678601264954, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.41772206127643585, + "step": 1117, + "token_counts/after_target": 1081.25, + "token_counts/after_think": 24.0, + "token_counts/before_target": 2231.0, + "token_counts/before_think": 913.25 + }, + { + "avg_penalty/after_target": 2.901383697986603, + "avg_penalty/after_think": 2.9008132815361023, + "avg_penalty/before_target": 0.5670506730675697, + "avg_penalty/before_think": 0.5479741841554642, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 773.0, + "completions/max_terminated_length": 530.5, + "completions/mean_length": 294.875, + "completions/mean_terminated_length": 272.57813262939453, + "completions/min_length": 63.75, + "completions/min_terminated_length": 63.75, + "epoch": 0.559, + "grad_norm": 14.217864990234375, + "kl": 12.7578125, + "learning_rate": 9.703337559148892e-06, + "loss": 1.5679, + "num_tokens": 34670181.0, + "reward": 1.61328125, + "reward_std": 0.6635110974311829, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.41419370472431183, + "rewards/tag_count_reward/mean": 0.86328125, + "rewards/tag_count_reward/std": 0.2814188450574875, + 
"step": 1118, + "token_counts/after_target": 1083.75, + "token_counts/after_think": 161.25, + "token_counts/before_target": 2144.5, + "token_counts/before_think": 1328.5 + }, + { + "avg_penalty/after_target": 2.31718847155571, + "avg_penalty/after_think": 3.966194748878479, + "avg_penalty/before_target": 0.6777081862092018, + "avg_penalty/before_think": 0.562414713203907, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 816.75, + "completions/max_terminated_length": 792.75, + "completions/mean_length": 345.71875, + "completions/mean_terminated_length": 336.5062561035156, + "completions/min_length": 53.25, + "completions/min_terminated_length": 53.25, + "epoch": 0.5595, + "grad_norm": 14.559903144836426, + "kl": 15.53125, + "learning_rate": 9.685892409218718e-06, + "loss": 1.8106, + "num_tokens": 34704099.0, + "reward": 1.56640625, + "reward_std": 0.7219556719064713, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42733466625213623, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.3228742480278015, + "step": 1119, + "token_counts/after_target": 1804.75, + "token_counts/after_think": 58.25, + "token_counts/before_target": 2428.0, + "token_counts/before_think": 1240.5 + }, + { + "avg_penalty/after_target": 2.397211790084839, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.4811251759529114, + "avg_penalty/before_think": 0.7641226947307587, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 856.25, + "completions/max_terminated_length": 770.75, + "completions/mean_length": 
370.890625, + "completions/mean_terminated_length": 359.95521545410156, + "completions/min_length": 81.25, + "completions/min_terminated_length": 81.25, + "epoch": 0.56, + "grad_norm": 6.741021156311035, + "kl": 17.046875, + "learning_rate": 9.668448216114739e-06, + "loss": 1.618, + "num_tokens": 34737228.0, + "reward": 1.36328125, + "reward_std": 0.884574368596077, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.49345622956752777, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.40549876540899277, + "step": 1120, + "token_counts/after_target": 1598.0, + "token_counts/after_think": 10.0, + "token_counts/before_target": 2438.5, + "token_counts/before_think": 1887.75 + }, + { + "avg_penalty/after_target": 2.62263485789299, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.34891626238822937, + "avg_penalty/before_think": 0.5831495821475983, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.5, + "completions/max_terminated_length": 547.5, + "completions/mean_length": 250.125, + "completions/mean_terminated_length": 250.125, + "completions/min_length": 47.25, + "completions/min_terminated_length": 47.25, + "epoch": 0.5605, + "grad_norm": 6.868743896484375, + "kl": 11.3125, + "learning_rate": 9.651005032974994e-06, + "loss": 1.182, + "num_tokens": 34762020.0, + "reward": 1.48828125, + "reward_std": 0.8449244797229767, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.46513500809669495, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.3839787244796753, + "step": 1121, + "token_counts/after_target": 744.25, 
+ "token_counts/after_think": 187.0, + "token_counts/before_target": 2065.5, + "token_counts/before_think": 1005.25 + }, + { + "avg_penalty/after_target": 2.3434643745422363, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4161060303449631, + "avg_penalty/before_think": 0.5891213268041611, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 843.5, + "completions/max_terminated_length": 719.5, + "completions/mean_length": 338.5, + "completions/mean_terminated_length": 325.8666687011719, + "completions/min_length": 50.75, + "completions/min_terminated_length": 50.75, + "epoch": 0.561, + "grad_norm": 5.081603527069092, + "kl": 17.109375, + "learning_rate": 9.633562912934436e-06, + "loss": 1.5955, + "num_tokens": 34793492.0, + "reward": 1.4296875, + "reward_std": 0.7435154318809509, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4665650501847267, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.3177653420716524, + "step": 1122, + "token_counts/after_target": 1220.5, + "token_counts/after_think": 107.75, + "token_counts/before_target": 2376.25, + "token_counts/before_think": 1711.5 + }, + { + "avg_penalty/after_target": 2.0949121713638306, + "avg_penalty/after_think": 3.560554265975952, + "avg_penalty/before_target": 0.31950508058071136, + "avg_penalty/before_think": 0.6525627300143242, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 755.75, + "completions/max_terminated_length": 647.25, + "completions/mean_length": 306.84375, + "completions/mean_terminated_length": 
296.18333435058594, + "completions/min_length": 38.75, + "completions/min_terminated_length": 38.75, + "epoch": 0.5615, + "grad_norm": 9.483495712280273, + "kl": 19.546875, + "learning_rate": 9.616121909124801e-06, + "loss": 1.4152, + "num_tokens": 34826010.0, + "reward": 1.375, + "reward_std": 0.8090845197439194, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4665650501847267, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.37025244161486626, + "step": 1123, + "token_counts/after_target": 663.0, + "token_counts/after_think": 194.25, + "token_counts/before_target": 2964.5, + "token_counts/before_think": 1087.75 + }, + { + "avg_penalty/after_target": 2.133454591035843, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.5353153571486473, + "avg_penalty/before_think": 0.5810292065143585, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.25, + "completions/max_terminated_length": 795.25, + "completions/mean_length": 349.453125, + "completions/mean_terminated_length": 349.453125, + "completions/min_length": 54.75, + "completions/min_terminated_length": 54.75, + "epoch": 0.562, + "grad_norm": 9.736372947692871, + "kl": 24.25, + "learning_rate": 9.598682074674405e-06, + "loss": 1.794, + "num_tokens": 34856775.0, + "reward": 1.23828125, + "reward_std": 0.8626738339662552, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.5625, + "rewards/format_reward/std": 0.49345622956752777, + "rewards/tag_count_reward/mean": 0.66015625, + "rewards/tag_count_reward/std": 0.39211705699563026, + "step": 1124, + "token_counts/after_target": 1277.5, + "token_counts/after_think": 260.25, + 
"token_counts/before_target": 2745.25, + "token_counts/before_think": 1308.25 + }, + { + "avg_penalty/after_target": 2.523219645023346, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.47980981320142746, + "avg_penalty/before_think": 0.7700952738523483, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 677.25, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 310.359375, + "completions/mean_terminated_length": 300.31146240234375, + "completions/min_length": 51.75, + "completions/min_terminated_length": 51.75, + "epoch": 0.5625, + "grad_norm": 6.671932220458984, + "kl": 21.078125, + "learning_rate": 9.581243462708007e-06, + "loss": 1.6711, + "num_tokens": 34888286.0, + "reward": 1.3984375, + "reward_std": 0.855292022228241, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.47354350984096527, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.39848271012306213, + "step": 1125, + "token_counts/after_target": 1333.0, + "token_counts/after_think": 141.25, + "token_counts/before_target": 2496.25, + "token_counts/before_think": 995.25 + }, + { + "avg_penalty/after_target": 2.1852124631404877, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.48223328590393066, + "avg_penalty/before_think": 0.7066483646631241, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 797.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 352.90625, + "completions/mean_terminated_length": 344.08646392822266, + "completions/min_length": 29.25, + 
"completions/min_terminated_length": 29.25, + "epoch": 0.563, + "grad_norm": 8.50853157043457, + "kl": 21.546875, + "learning_rate": 9.563806126346643e-06, + "loss": 1.6068, + "num_tokens": 34919368.0, + "reward": 1.28125, + "reward_std": 0.808777466416359, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.53125, + "rewards/format_reward/std": 0.4713720977306366, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.37721574306488037, + "step": 1126, + "token_counts/after_target": 1345.5, + "token_counts/after_think": 331.0, + "token_counts/before_target": 2841.5, + "token_counts/before_think": 1128.5 + }, + { + "avg_penalty/after_target": 2.753769725561142, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3901730217039585, + "avg_penalty/before_think": 0.4427391439676285, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 785.75, + "completions/max_terminated_length": 785.75, + "completions/mean_length": 261.140625, + "completions/mean_terminated_length": 261.140625, + "completions/min_length": 44.25, + "completions/min_terminated_length": 44.25, + "epoch": 0.5635, + "grad_norm": 8.134421348571777, + "kl": 24.21875, + "learning_rate": 9.546370118707463e-06, + "loss": 1.8265, + "num_tokens": 34945089.0, + "reward": 1.26953125, + "reward_std": 0.9175145030021667, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.49244368076324463, + "rewards/tag_count_reward/mean": 0.66015625, + "rewards/tag_count_reward/std": 0.44396401196718216, + "step": 1127, + "token_counts/after_target": 884.25, + "token_counts/after_think": 30.5, + "token_counts/before_target": 2218.25, + 
"token_counts/before_think": 1045.25 + }, + { + "avg_penalty/after_target": 2.3817259073257446, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.4427913576364517, + "avg_penalty/before_think": 0.6689478084445, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.25, + "completions/max_terminated_length": 699.25, + "completions/mean_length": 305.125, + "completions/mean_terminated_length": 305.125, + "completions/min_length": 71.5, + "completions/min_terminated_length": 71.5, + "epoch": 0.564, + "grad_norm": 2.887479305267334, + "kl": 18.1875, + "learning_rate": 9.528935492903575e-06, + "loss": 1.5693, + "num_tokens": 34975049.0, + "reward": 1.3125, + "reward_std": 0.8133781403303146, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.46513500809669495, + "rewards/tag_count_reward/mean": 0.703125, + "rewards/tag_count_reward/std": 0.38217610120773315, + "step": 1128, + "token_counts/after_target": 968.25, + "token_counts/after_think": 135.25, + "token_counts/before_target": 2750.75, + "token_counts/before_think": 1027.75 + }, + { + "avg_penalty/after_target": 3.0403767824172974, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.39722198247909546, + "avg_penalty/before_think": 0.6838456615805626, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 718.75, + "completions/max_terminated_length": 693.25, + "completions/mean_length": 303.21875, + "completions/mean_terminated_length": 292.4031295776367, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.5645, + 
"grad_norm": 3.6261746883392334, + "kl": 19.4375, + "learning_rate": 9.511502302043867e-06, + "loss": 1.6889, + "num_tokens": 35013079.0, + "reward": 1.28125, + "reward_std": 0.897643193602562, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4939897432923317, + "rewards/tag_count_reward/mean": 0.671875, + "rewards/tag_count_reward/std": 0.42315778136253357, + "step": 1129, + "token_counts/after_target": 1243.5, + "token_counts/after_think": 126.75, + "token_counts/before_target": 2561.75, + "token_counts/before_think": 919.5 + }, + { + "avg_penalty/after_target": 2.4436916410923004, + "avg_penalty/after_think": 1.7866724729537964, + "avg_penalty/before_target": 0.5436149910092354, + "avg_penalty/before_think": 0.5992405563592911, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 911.75, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 291.71875, + "completions/mean_terminated_length": 281.3041687011719, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.565, + "grad_norm": 10.106761932373047, + "kl": 21.03125, + "learning_rate": 9.494070599232868e-06, + "loss": 2.0569, + "num_tokens": 35043797.0, + "reward": 1.34765625, + "reward_std": 0.9016165286302567, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.47663040459156036, + "rewards/tag_count_reward/mean": 0.69140625, + "rewards/tag_count_reward/std": 0.43545521050691605, + "step": 1130, + "token_counts/after_target": 1479.0, + "token_counts/after_think": 30.25, + "token_counts/before_target": 2313.25, + "token_counts/before_think": 845.0 + }, + { + "avg_penalty/after_target": 
2.2445965707302094, + "avg_penalty/after_think": 2.6322319507598877, + "avg_penalty/before_target": 0.3188713416457176, + "avg_penalty/before_think": 0.5702590644359589, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.5, + "completions/max_terminated_length": 564.5, + "completions/mean_length": 270.3125, + "completions/mean_terminated_length": 270.3125, + "completions/min_length": 67.75, + "completions/min_terminated_length": 67.75, + "epoch": 0.5655, + "grad_norm": 3.475095510482788, + "kl": 18.5625, + "learning_rate": 9.476640437570562e-06, + "loss": 1.4843, + "num_tokens": 35068793.0, + "reward": 1.23046875, + "reward_std": 0.9571926444768906, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.5061737895011902, + "rewards/tag_count_reward/mean": 0.63671875, + "rewards/tag_count_reward/std": 0.46779290586709976, + "step": 1131, + "token_counts/after_target": 741.5, + "token_counts/after_think": 51.25, + "token_counts/before_target": 2395.75, + "token_counts/before_think": 1136.5 + }, + { + "avg_penalty/after_target": 1.6850363314151764, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.5116568058729172, + "avg_penalty/before_think": 0.480901263654232, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 848.75, + "completions/max_terminated_length": 789.25, + "completions/mean_length": 374.578125, + "completions/mean_terminated_length": 363.13958740234375, + "completions/min_length": 54.25, + "completions/min_terminated_length": 54.25, + "epoch": 0.566, + "grad_norm": 3.0286920070648193, + "kl": 19.078125, + 
"learning_rate": 9.459211870152247e-06, + "loss": 1.5806, + "num_tokens": 35103390.0, + "reward": 1.25, + "reward_std": 0.9069202095270157, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.49808918684720993, + "rewards/tag_count_reward/mean": 0.671875, + "rewards/tag_count_reward/std": 0.4490862414240837, + "step": 1132, + "token_counts/after_target": 1189.5, + "token_counts/after_think": 137.75, + "token_counts/before_target": 3324.75, + "token_counts/before_think": 1341.25 + }, + { + "avg_penalty/after_target": 2.3993547558784485, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.5519629567861557, + "avg_penalty/before_think": 0.4756855107843876, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.25, + "completions/max_terminated_length": 744.25, + "completions/mean_length": 262.203125, + "completions/mean_terminated_length": 262.203125, + "completions/min_length": 51.25, + "completions/min_terminated_length": 51.25, + "epoch": 0.5665, + "grad_norm": 9.034510612487793, + "kl": 20.703125, + "learning_rate": 9.441784950068362e-06, + "loss": 1.9736, + "num_tokens": 35130491.0, + "reward": 1.29296875, + "reward_std": 0.9099084287881851, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.4818056970834732, + "rewards/tag_count_reward/mean": 0.66796875, + "rewards/tag_count_reward/std": 0.4368480145931244, + "step": 1133, + "token_counts/after_target": 1196.0, + "token_counts/after_think": 17.0, + "token_counts/before_target": 2273.25, + "token_counts/before_think": 709.0 + }, + { + "avg_penalty/after_target": 2.38667368888855, + "avg_penalty/after_think": 3.0, + 
"avg_penalty/before_target": 0.45088256150484085, + "avg_penalty/before_think": 0.4099964611232281, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.567, + "grad_norm": 10.227078437805176, + "kl": 15.734375, + "learning_rate": 9.424359730404329e-06, + "loss": 1.6214, + "num_tokens": 35161827.0, + "reward": 1.421875, + "reward_std": 0.8950642049312592, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.48148179799318314, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.4214957132935524, + "step": 1134, + "token_counts/after_target": 913.5, + "token_counts/after_think": 67.0, + "token_counts/before_target": 2314.5, + "token_counts/before_think": 1027.0 + }, + { + "avg_penalty/after_target": 2.1316996812820435, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4753785729408264, + "avg_penalty/before_think": 0.8355093523859978, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.25, + "completions/max_terminated_length": 718.25, + "completions/mean_length": 327.90625, + "completions/mean_terminated_length": 327.90625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.5675, + "grad_norm": 4.235138416290283, + "kl": 20.375, + "learning_rate": 9.406936264240386e-06, + "loss": 1.8619, + "num_tokens": 35194205.0, + 
"reward": 1.25, + "reward_std": 0.938416063785553, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.5061737895011902, + "rewards/tag_count_reward/mean": 0.65625, + "rewards/tag_count_reward/std": 0.4557069465517998, + "step": 1135, + "token_counts/after_target": 1136.5, + "token_counts/after_think": 195.75, + "token_counts/before_target": 2798.0, + "token_counts/before_think": 1116.25 + }, + { + "avg_penalty/after_target": 2.1391239762306213, + "avg_penalty/after_think": 3.925626218318939, + "avg_penalty/before_target": 0.4745098575949669, + "avg_penalty/before_think": 0.580150656402111, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 779.5, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 291.265625, + "completions/mean_terminated_length": 279.98958587646484, + "completions/min_length": 53.5, + "completions/min_terminated_length": 53.5, + "epoch": 0.568, + "grad_norm": 8.706633567810059, + "kl": 13.3359375, + "learning_rate": 9.38951460465143e-06, + "loss": 1.482, + "num_tokens": 35222126.0, + "reward": 1.58984375, + "reward_std": 0.79423588514328, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.3956565484404564, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.37016743421554565, + "step": 1136, + "token_counts/after_target": 1011.25, + "token_counts/after_think": 49.25, + "token_counts/before_target": 2638.75, + "token_counts/before_think": 961.0 + }, + { + "avg_penalty/after_target": 2.482598155736923, + "avg_penalty/after_think": 2.4347307682037354, + "avg_penalty/before_target": 0.4155122935771942, + 
"avg_penalty/before_think": 0.5432829186320305, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.5, + "completions/max_terminated_length": 603.5, + "completions/mean_length": 275.859375, + "completions/mean_terminated_length": 275.859375, + "completions/min_length": 81.75, + "completions/min_terminated_length": 81.75, + "epoch": 0.5685, + "grad_norm": 4.7498931884765625, + "kl": 18.0625, + "learning_rate": 9.372094804706867e-06, + "loss": 1.5849, + "num_tokens": 35249701.0, + "reward": 1.34375, + "reward_std": 0.859884187579155, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.46875541657209396, + "rewards/tag_count_reward/mean": 0.6875, + "rewards/tag_count_reward/std": 0.41148635745048523, + "step": 1137, + "token_counts/after_target": 827.25, + "token_counts/after_think": 33.25, + "token_counts/before_target": 2760.0, + "token_counts/before_think": 793.25 + }, + { + "avg_penalty/after_target": 2.97134131193161, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.4005269631743431, + "avg_penalty/before_think": 0.46014445275068283, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 241.234375, + "completions/mean_terminated_length": 241.234375, + "completions/min_length": 50.5, + "completions/min_terminated_length": 50.5, + "epoch": 0.569, + "grad_norm": 7.2914276123046875, + "kl": 22.0625, + "learning_rate": 9.354676917470421e-06, + "loss": 2.0426, + "num_tokens": 35278628.0, + "reward": 1.37890625, + "reward_std": 
0.9333416521549225, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.11180340498685837, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.4909028485417366, + "rewards/tag_count_reward/mean": 0.69140625, + "rewards/tag_count_reward/std": 0.41367993503808975, + "step": 1138, + "token_counts/after_target": 1030.5, + "token_counts/after_think": 56.0, + "token_counts/before_target": 2140.0, + "token_counts/before_think": 633.25 + }, + { + "avg_penalty/after_target": 3.381391167640686, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.38393034040927887, + "avg_penalty/before_think": 0.3940102756023407, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 216.359375, + "completions/mean_terminated_length": 216.359375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.5695, + "grad_norm": 8.664168357849121, + "kl": 18.1875, + "learning_rate": 9.337260996000002e-06, + "loss": 1.8209, + "num_tokens": 35301739.0, + "reward": 1.4296875, + "reward_std": 0.8848578035831451, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4459725022315979, + "rewards/tag_count_reward/mean": 0.7109375, + "rewards/tag_count_reward/std": 0.42282112687826157, + "step": 1139, + "token_counts/after_target": 793.75, + "token_counts/after_think": 124.0, + "token_counts/before_target": 1762.75, + "token_counts/before_think": 781.25 + }, + { + "avg_penalty/after_target": 2.7505295276641846, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.2735484875738621, + "avg_penalty/before_think": 0.4515932574868202, + "clip_ratio/high_max": 
0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 214.390625, + "completions/mean_terminated_length": 214.390625, + "completions/min_length": 30.25, + "completions/min_terminated_length": 30.25, + "epoch": 0.57, + "grad_norm": 15.108866691589355, + "kl": 26.53125, + "learning_rate": 9.319847093347522e-06, + "loss": 1.7683, + "num_tokens": 35326292.0, + "reward": 1.2578125, + "reward_std": 1.022889256477356, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.125, + "rewards/format_reward/mean": 0.546875, + "rewards/format_reward/std": 0.498777836561203, + "rewards/tag_count_reward/mean": 0.6171875, + "rewards/tag_count_reward/std": 0.45973049104213715, + "step": 1140, + "token_counts/after_target": 568.75, + "token_counts/after_think": 11.0, + "token_counts/before_target": 2067.0, + "token_counts/before_think": 783.5 + }, + { + "avg_penalty/after_target": 2.8893730342388153, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3665856346487999, + "avg_penalty/before_think": 0.3436776325106621, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 595.25, + "completions/max_terminated_length": 505.5, + "completions/mean_length": 213.453125, + "completions/mean_terminated_length": 200.79687881469727, + "completions/min_length": 50.5, + "completions/min_terminated_length": 50.5, + "epoch": 0.5705, + "grad_norm": 8.403875350952148, + "kl": 26.09375, + "learning_rate": 9.302435262558748e-06, + "loss": 2.0558, + "num_tokens": 35351233.0, + "reward": 1.35546875, + "reward_std": 0.9132920503616333, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.48456869274377823, + "rewards/tag_count_reward/mean": 0.69921875, + "rewards/tag_count_reward/std": 0.43984176963567734, + "step": 1141, + "token_counts/after_target": 815.75, + "token_counts/after_think": 95.75, + "token_counts/before_target": 1840.75, + "token_counts/before_think": 663.0 + }, + { + "avg_penalty/after_target": 1.824087679386139, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.44242746382951736, + "avg_penalty/before_think": 0.6070660054683685, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.75, + "completions/max_terminated_length": 614.75, + "completions/mean_length": 215.640625, + "completions/mean_terminated_length": 215.640625, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.571, + "grad_norm": 14.474445343017578, + "kl": 30.875, + "learning_rate": 9.285025556673141e-06, + "loss": 2.2326, + "num_tokens": 35376298.0, + "reward": 1.4140625, + "reward_std": 0.874741405248642, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4581565484404564, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.42466603964567184, + "step": 1142, + "token_counts/after_target": 565.5, + "token_counts/after_think": 84.25, + "token_counts/before_target": 2263.5, + "token_counts/before_think": 537.0 + }, + { + "avg_penalty/after_target": 2.337410569190979, + "avg_penalty/after_think": 2.936623454093933, + "avg_penalty/before_target": 0.4168012887239456, + "avg_penalty/before_think": 0.45679087936878204, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 162.890625, + "completions/mean_terminated_length": 162.890625, + "completions/min_length": 49.75, + "completions/min_terminated_length": 49.75, + "epoch": 0.5715, + "grad_norm": 6.122369766235352, + "kl": 23.125, + "learning_rate": 9.267618028723687e-06, + "loss": 1.911, + "num_tokens": 35395939.0, + "reward": 1.5390625, + "reward_std": 0.8152015507221222, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4154609143733978, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.4028930887579918, + "step": 1143, + "token_counts/after_target": 427.0, + "token_counts/after_think": 20.5, + "token_counts/before_target": 1497.25, + "token_counts/before_think": 661.5 + }, + { + "avg_penalty/after_target": 2.3402577340602875, + "avg_penalty/after_think": 2.55545711517334, + "avg_penalty/before_target": 0.5156879797577858, + "avg_penalty/before_think": 0.45548736304044724, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 795.25, + "completions/max_terminated_length": 536.5, + "completions/mean_length": 223.984375, + "completions/mean_terminated_length": 198.8947982788086, + "completions/min_length": 59.5, + "completions/min_terminated_length": 59.5, + "epoch": 0.572, + "grad_norm": 14.43188762664795, + "kl": 32.5, + "learning_rate": 9.250212731736726e-06, + "loss": 2.3973, + "num_tokens": 35421586.0, + "reward": 1.28515625, + "reward_std": 0.9108837246894836, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + 
"rewards/format_reward/std": 0.48866813629865646, + "rewards/tag_count_reward/mean": 0.66015625, + "rewards/tag_count_reward/std": 0.4389706030488014, + "step": 1144, + "token_counts/after_target": 825.25, + "token_counts/after_think": 31.5, + "token_counts/before_target": 2126.5, + "token_counts/before_think": 600.5 + }, + { + "avg_penalty/after_target": 1.6036575138568878, + "avg_penalty/after_think": 2.8775933384895325, + "avg_penalty/before_target": 0.4954012408852577, + "avg_penalty/before_think": 0.43840737640857697, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 184.53125, + "completions/mean_terminated_length": 184.53125, + "completions/min_length": 58.25, + "completions/min_terminated_length": 58.25, + "epoch": 0.5725, + "grad_norm": 8.013411521911621, + "kl": 27.625, + "learning_rate": 9.232809718731815e-06, + "loss": 2.1763, + "num_tokens": 35449156.0, + "reward": 1.49609375, + "reward_std": 0.8170228749513626, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4339347705245018, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.3925963416695595, + "step": 1145, + "token_counts/after_target": 552.75, + "token_counts/after_think": 78.25, + "token_counts/before_target": 1714.0, + "token_counts/before_think": 607.5 + }, + { + "avg_penalty/after_target": 2.885169208049774, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3751804456114769, + "avg_penalty/before_think": 0.41529788821935654, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.03125, + "completions/max_length": 691.25, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 212.5625, + "completions/mean_terminated_length": 187.1927146911621, + "completions/min_length": 39.5, + "completions/min_terminated_length": 39.5, + "epoch": 0.573, + "grad_norm": 8.60291576385498, + "kl": 28.03125, + "learning_rate": 9.215409042721553e-06, + "loss": 2.2049, + "num_tokens": 35476664.0, + "reward": 1.4375, + "reward_std": 0.8473509103059769, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4534844756126404, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.405112661421299, + "step": 1146, + "token_counts/after_target": 739.25, + "token_counts/after_think": 30.0, + "token_counts/before_target": 2067.0, + "token_counts/before_think": 564.75 + }, + { + "avg_penalty/after_target": 2.403749644756317, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5719042718410492, + "avg_penalty/before_think": 0.6882153451442719, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 840.75, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 292.46875, + "completions/mean_terminated_length": 254.7128028869629, + "completions/min_length": 35.25, + "completions/min_terminated_length": 35.25, + "epoch": 0.5735, + "grad_norm": 7.702539920806885, + "kl": 29.15625, + "learning_rate": 9.198010756711413e-06, + "loss": 2.4765, + "num_tokens": 35504550.0, + "reward": 1.3671875, + "reward_std": 0.9159492403268814, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.479247085750103, + 
"rewards/tag_count_reward/mean": 0.6953125, + "rewards/tag_count_reward/std": 0.44463978707790375, + "step": 1147, + "token_counts/after_target": 1376.5, + "token_counts/after_think": 27.75, + "token_counts/before_target": 2602.5, + "token_counts/before_think": 672.75 + }, + { + "avg_penalty/after_target": 3.0996667444705963, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.2637779265642166, + "avg_penalty/before_think": 0.4241897761821747, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.5, + "completions/max_terminated_length": 431.5, + "completions/mean_length": 175.3125, + "completions/mean_terminated_length": 175.3125, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.574, + "grad_norm": 2.528181791305542, + "kl": 18.96875, + "learning_rate": 9.180614913699593e-06, + "loss": 1.6177, + "num_tokens": 35525562.0, + "reward": 1.58203125, + "reward_std": 0.7666306793689728, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4255262687802315, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.3564481735229492, + "step": 1148, + "token_counts/after_target": 357.25, + "token_counts/after_think": 30.25, + "token_counts/before_target": 1802.75, + "token_counts/before_think": 614.75 + }, + { + "avg_penalty/after_target": 2.0942935049533844, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.2889907471835613, + "avg_penalty/before_think": 0.41106345504522324, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.5, + 
"completions/max_terminated_length": 549.5, + "completions/mean_length": 197.984375, + "completions/mean_terminated_length": 197.984375, + "completions/min_length": 42.5, + "completions/min_terminated_length": 42.5, + "epoch": 0.5745, + "grad_norm": 4.5581159591674805, + "kl": 21.3125, + "learning_rate": 9.163221566676847e-06, + "loss": 1.6367, + "num_tokens": 35547705.0, + "reward": 1.44140625, + "reward_std": 0.8807009756565094, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.46513500809669495, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.4224105179309845, + "step": 1149, + "token_counts/after_target": 345.0, + "token_counts/after_think": 37.0, + "token_counts/before_target": 1992.5, + "token_counts/before_think": 793.25 + }, + { + "avg_penalty/after_target": 2.9835516810417175, + "avg_penalty/after_think": 1.9712134003639221, + "avg_penalty/before_target": 0.3808690197765827, + "avg_penalty/before_think": 0.5038957074284554, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 739.75, + "completions/max_terminated_length": 594.25, + "completions/mean_length": 252.46875, + "completions/mean_terminated_length": 239.9864616394043, + "completions/min_length": 66.75, + "completions/min_terminated_length": 66.75, + "epoch": 0.575, + "grad_norm": 11.903858184814453, + "kl": 16.4921875, + "learning_rate": 9.145830768626326e-06, + "loss": 1.8003, + "num_tokens": 35573687.0, + "reward": 1.578125, + "reward_std": 0.8150507360696793, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42867646366357803, + "rewards/tag_count_reward/mean": 0.796875, + 
"rewards/tag_count_reward/std": 0.3845718652009964, + "step": 1150, + "token_counts/after_target": 1018.75, + "token_counts/after_think": 26.75, + "token_counts/before_target": 1979.25, + "token_counts/before_think": 1014.75 + }, + { + "avg_penalty/after_target": 2.22599333524704, + "avg_penalty/after_think": 2.8130863308906555, + "avg_penalty/before_target": 0.3232031464576721, + "avg_penalty/before_think": 0.9169310703873634, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 620.25, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 197.40625, + "completions/mean_terminated_length": 184.45729446411133, + "completions/min_length": 36.5, + "completions/min_terminated_length": 36.5, + "epoch": 0.5755, + "grad_norm": 5.124884605407715, + "kl": 26.40625, + "learning_rate": 9.128442572523418e-06, + "loss": 2.0042, + "num_tokens": 35594257.0, + "reward": 0.99609375, + "reward_std": 0.7585897445678711, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.46875, + "rewards/format_reward/std": 0.4194520115852356, + "rewards/tag_count_reward/mean": 0.52734375, + "rewards/tag_count_reward/std": 0.36143942177295685, + "step": 1151, + "token_counts/after_target": 550.5, + "token_counts/after_think": 16.5, + "token_counts/before_target": 2176.5, + "token_counts/before_think": 415.0 + }, + { + "avg_penalty/after_target": 2.3107215464115143, + "avg_penalty/after_think": 2.5426318645477295, + "avg_penalty/before_target": 0.42414354160428047, + "avg_penalty/before_think": 0.47367703914642334, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.25, + 
"completions/max_terminated_length": 632.25, + "completions/mean_length": 205.5, + "completions/mean_terminated_length": 205.5, + "completions/min_length": 60.5, + "completions/min_terminated_length": 60.5, + "epoch": 0.576, + "grad_norm": 9.570795059204102, + "kl": 17.515625, + "learning_rate": 9.111057031335586e-06, + "loss": 1.7651, + "num_tokens": 35615681.0, + "reward": 1.5234375, + "reward_std": 0.8199173510074615, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4260597825050354, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.40389713644981384, + "step": 1152, + "token_counts/after_target": 607.75, + "token_counts/after_think": 48.25, + "token_counts/before_target": 1717.25, + "token_counts/before_think": 914.75 + }, + { + "avg_penalty/after_target": 3.1507621705532074, + "avg_penalty/after_think": 2.7735301852226257, + "avg_penalty/before_target": 0.3866240307688713, + "avg_penalty/before_think": 0.6138686686754227, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 257.5625, + "completions/mean_terminated_length": 257.5625, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.5765, + "grad_norm": 11.22321605682373, + "kl": 16.0625, + "learning_rate": 9.093674198022201e-06, + "loss": 1.5978, + "num_tokens": 35640661.0, + "reward": 1.44921875, + "reward_std": 0.9347531497478485, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4704566150903702, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 
0.4264826029539108, + "step": 1153, + "token_counts/after_target": 774.0, + "token_counts/after_think": 55.75, + "token_counts/before_target": 2326.0, + "token_counts/before_think": 965.25 + }, + { + "avg_penalty/after_target": 3.3562833070755005, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4188139736652374, + "avg_penalty/before_think": 0.5089957863092422, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 647.5, + "completions/max_terminated_length": 608.5, + "completions/mean_length": 236.5625, + "completions/mean_terminated_length": 224.2583351135254, + "completions/min_length": 67.25, + "completions/min_terminated_length": 67.25, + "epoch": 0.577, + "grad_norm": 22.364334106445312, + "kl": 11.875, + "learning_rate": 9.076294125534382e-06, + "loss": 1.79, + "num_tokens": 35667065.0, + "reward": 1.6875, + "reward_std": 0.7286049574613571, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36797718703746796, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.3618217036128044, + "step": 1154, + "token_counts/after_target": 1017.75, + "token_counts/after_think": 30.0, + "token_counts/before_target": 2001.5, + "token_counts/before_think": 735.75 + }, + { + "avg_penalty/after_target": 2.7557106614112854, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3618709146976471, + "avg_penalty/before_think": 0.37339945137500763, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 199.765625, + 
"completions/mean_terminated_length": 199.765625, + "completions/min_length": 45.25, + "completions/min_terminated_length": 45.25, + "epoch": 0.5775, + "grad_norm": 17.583288192749023, + "kl": 17.484375, + "learning_rate": 9.058916866814857e-06, + "loss": 1.904, + "num_tokens": 35690106.0, + "reward": 1.4453125, + "reward_std": 0.8741140514612198, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.44495995342731476, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.43008097261190414, + "step": 1155, + "token_counts/after_target": 810.0, + "token_counts/after_think": 18.0, + "token_counts/before_target": 1591.5, + "token_counts/before_think": 776.75 + }, + { + "avg_penalty/after_target": 2.758955240249634, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.24697205051779747, + "avg_penalty/before_think": 0.38561301678419113, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 203.59375, + "completions/mean_terminated_length": 203.59375, + "completions/min_length": 48.25, + "completions/min_terminated_length": 48.25, + "epoch": 0.578, + "grad_norm": 9.063862800598145, + "kl": 13.59375, + "learning_rate": 9.04154247479776e-06, + "loss": 1.346, + "num_tokens": 35713792.0, + "reward": 1.53125, + "reward_std": 0.8294303566217422, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.43655145168304443, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.399491548538208, + "step": 1156, + "token_counts/after_target": 466.25, + "token_counts/after_think": 12.75, + 
"token_counts/before_target": 2073.5, + "token_counts/before_think": 705.0 + }, + { + "avg_penalty/after_target": 2.8565260469913483, + "avg_penalty/after_think": 3.7626718878746033, + "avg_penalty/before_target": 0.2697886750102043, + "avg_penalty/before_think": 0.45598509907722473, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.5, + "completions/max_terminated_length": 427.5, + "completions/mean_length": 136.4375, + "completions/mean_terminated_length": 136.4375, + "completions/min_length": 45.25, + "completions/min_terminated_length": 45.25, + "epoch": 0.5785, + "grad_norm": 12.93287181854248, + "kl": 13.6328125, + "learning_rate": 9.024171002408507e-06, + "loss": 1.5821, + "num_tokens": 35732236.0, + "reward": 1.64453125, + "reward_std": 0.7093949317932129, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3811737820506096, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.33570220321416855, + "step": 1157, + "token_counts/after_target": 361.5, + "token_counts/after_think": 25.0, + "token_counts/before_target": 1271.25, + "token_counts/before_think": 525.25 + }, + { + "avg_penalty/after_target": 2.2962554693222046, + "avg_penalty/after_think": 2.8446889519691467, + "avg_penalty/before_target": 0.4470776207745075, + "avg_penalty/before_think": 0.45759257674217224, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 728.25, + "completions/max_terminated_length": 632.25, + "completions/mean_length": 219.140625, + "completions/mean_terminated_length": 208.01563262939453, + 
"completions/min_length": 56.25, + "completions/min_terminated_length": 56.25, + "epoch": 0.579, + "grad_norm": 10.157465934753418, + "kl": 19.3125, + "learning_rate": 9.006802502563613e-06, + "loss": 1.9503, + "num_tokens": 35756773.0, + "reward": 1.5078125, + "reward_std": 0.829485684633255, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4260597825050354, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.41066307574510574, + "step": 1158, + "token_counts/after_target": 874.5, + "token_counts/after_think": 58.75, + "token_counts/before_target": 1888.25, + "token_counts/before_think": 684.75 + }, + { + "avg_penalty/after_target": 2.8800521194934845, + "avg_penalty/after_think": 2.571906268596649, + "avg_penalty/before_target": 0.30961349979043007, + "avg_penalty/before_think": 0.483786940574646, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.75, + "completions/max_terminated_length": 492.75, + "completions/mean_length": 192.53125, + "completions/mean_terminated_length": 192.53125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.5795, + "grad_norm": 3.5960471630096436, + "kl": 14.4609375, + "learning_rate": 8.989437028170537e-06, + "loss": 1.3399, + "num_tokens": 35778039.0, + "reward": 1.5546875, + "reward_std": 0.7719491273164749, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.40316852182149887, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.37349415570497513, + "step": 1159, + "token_counts/after_target": 363.5, + "token_counts/after_think": 27.0, + "token_counts/before_target": 
1853.0, + "token_counts/before_think": 837.0 + }, + { + "avg_penalty/after_target": 3.0182279348373413, + "avg_penalty/after_think": 3.7808828353881836, + "avg_penalty/before_target": 0.27850472182035446, + "avg_penalty/before_think": 0.3785399720072746, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.75, + "completions/max_terminated_length": 530.75, + "completions/mean_length": 173.4375, + "completions/mean_terminated_length": 173.4375, + "completions/min_length": 62.5, + "completions/min_terminated_length": 62.5, + "epoch": 0.58, + "grad_norm": 3.7443530559539795, + "kl": 18.296875, + "learning_rate": 8.972074632127533e-06, + "loss": 1.6857, + "num_tokens": 35802851.0, + "reward": 1.62890625, + "reward_std": 0.7721518352627754, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4101393073797226, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.34609660506248474, + "step": 1160, + "token_counts/after_target": 278.5, + "token_counts/after_think": 101.25, + "token_counts/before_target": 1813.75, + "token_counts/before_think": 581.5 + }, + { + "avg_penalty/after_target": 3.229632556438446, + "avg_penalty/after_think": 3.852670133113861, + "avg_penalty/before_target": 0.3097272887825966, + "avg_penalty/before_think": 0.5398079380393028, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.25, + "completions/max_terminated_length": 441.25, + "completions/mean_length": 157.421875, + "completions/mean_terminated_length": 157.421875, + "completions/min_length": 48.25, + 
"completions/min_terminated_length": 48.25, + "epoch": 0.5805, + "grad_norm": 6.378709316253662, + "kl": 13.46875, + "learning_rate": 8.954715367323468e-06, + "loss": 1.3495, + "num_tokens": 35826910.0, + "reward": 1.62109375, + "reward_std": 0.751412034034729, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.35179100185632706, + "step": 1161, + "token_counts/after_target": 382.5, + "token_counts/after_think": 49.0, + "token_counts/before_target": 1311.0, + "token_counts/before_think": 776.25 + }, + { + "avg_penalty/after_target": 2.514795333147049, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.31773691624403, + "avg_penalty/before_think": 0.5857430621981621, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 182.53125, + "completions/mean_terminated_length": 182.53125, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.581, + "grad_norm": 5.923469543457031, + "kl": 16.7265625, + "learning_rate": 8.937359286637672e-06, + "loss": 1.2942, + "num_tokens": 35848944.0, + "reward": 1.51171875, + "reward_std": 0.7973930537700653, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4493217319250107, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.376026950776577, + "step": 1162, + "token_counts/after_target": 425.0, + "token_counts/after_think": 92.0, + "token_counts/before_target": 1572.75, + "token_counts/before_think": 830.75 + }, + { 
+ "avg_penalty/after_target": 2.4796923100948334, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.36136600002646446, + "avg_penalty/before_think": 0.37866565585136414, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.5, + "completions/max_terminated_length": 568.5, + "completions/mean_length": 169.578125, + "completions/mean_terminated_length": 169.578125, + "completions/min_length": 25.75, + "completions/min_terminated_length": 25.75, + "epoch": 0.5815, + "grad_norm": 19.707908630371094, + "kl": 21.875, + "learning_rate": 8.920006442939772e-06, + "loss": 1.3787, + "num_tokens": 35867493.0, + "reward": 1.3828125, + "reward_std": 0.8850703835487366, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.48456869274377823, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.41558507829904556, + "step": 1163, + "token_counts/after_target": 330.5, + "token_counts/after_think": 72.25, + "token_counts/before_target": 1487.25, + "token_counts/before_think": 823.25 + }, + { + "avg_penalty/after_target": 2.534854769706726, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3616954833269119, + "avg_penalty/before_think": 0.40602656826376915, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.25, + "completions/max_terminated_length": 469.25, + "completions/mean_length": 175.6875, + "completions/mean_terminated_length": 175.6875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.582, + "grad_norm": 6.358192443847656, + "kl": 15.96875, + 
"learning_rate": 8.902656889089548e-06, + "loss": 1.3094, + "num_tokens": 35886561.0, + "reward": 1.6484375, + "reward_std": 0.6856121271848679, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.29867855086922646, + "step": 1164, + "token_counts/after_target": 567.0, + "token_counts/after_think": 10.75, + "token_counts/before_target": 1496.5, + "token_counts/before_think": 736.75 + }, + { + "avg_penalty/after_target": 3.208638072013855, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.30162273719906807, + "avg_penalty/before_think": 0.5560629293322563, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.5, + "completions/max_terminated_length": 506.5, + "completions/mean_length": 181.671875, + "completions/mean_terminated_length": 181.671875, + "completions/min_length": 38.75, + "completions/min_terminated_length": 38.75, + "epoch": 0.5825, + "grad_norm": 15.088048934936523, + "kl": 23.71875, + "learning_rate": 8.885310677936746e-06, + "loss": 1.6408, + "num_tokens": 35914188.0, + "reward": 1.44921875, + "reward_std": 0.8771105706691742, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.73046875, + "rewards/tag_count_reward/std": 0.4257132261991501, + "step": 1165, + "token_counts/after_target": 379.25, + "token_counts/after_think": 45.25, + "token_counts/before_target": 1674.0, + "token_counts/before_think": 808.25 + }, + { + "avg_penalty/after_target": 3.132361114025116, + "avg_penalty/after_think": 2.8387157917022705, + 
"avg_penalty/before_target": 0.2233952134847641, + "avg_penalty/before_think": 0.4394182041287422, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.5, + "completions/max_terminated_length": 360.5, + "completions/mean_length": 159.515625, + "completions/mean_terminated_length": 159.515625, + "completions/min_length": 37.25, + "completions/min_terminated_length": 37.25, + "epoch": 0.583, + "grad_norm": 10.238909721374512, + "kl": 15.220703125, + "learning_rate": 8.867967862320935e-06, + "loss": 0.9535, + "num_tokens": 35934349.0, + "reward": 1.46875, + "reward_std": 0.8586337566375732, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4598134011030197, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.4084940031170845, + "step": 1166, + "token_counts/after_target": 208.75, + "token_counts/after_think": 42.0, + "token_counts/before_target": 1553.0, + "token_counts/before_think": 748.5 + }, + { + "avg_penalty/after_target": 3.1879568696022034, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.32339154183864594, + "avg_penalty/before_think": 0.5032496675848961, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 165.375, + "completions/mean_terminated_length": 165.375, + "completions/min_length": 36.25, + "completions/min_terminated_length": 36.25, + "epoch": 0.5835, + "grad_norm": 8.2969970703125, + "kl": 16.34765625, + "learning_rate": 8.850628495071336e-06, + "loss": 1.3215, + "num_tokens": 35955973.0, + "reward": 
1.546875, + "reward_std": 0.7730246484279633, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4106728211045265, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.37397947907447815, + "step": 1167, + "token_counts/after_target": 450.75, + "token_counts/after_think": 70.75, + "token_counts/before_target": 1569.5, + "token_counts/before_think": 555.0 + }, + { + "avg_penalty/after_target": 2.948948919773102, + "avg_penalty/after_think": 3.0801445841789246, + "avg_penalty/before_target": 0.3047230429947376, + "avg_penalty/before_think": 0.4292188286781311, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 172.703125, + "completions/mean_terminated_length": 172.703125, + "completions/min_length": 31.5, + "completions/min_terminated_length": 31.5, + "epoch": 0.584, + "grad_norm": 8.367497444152832, + "kl": 21.78125, + "learning_rate": 8.833292629006669e-06, + "loss": 1.6567, + "num_tokens": 35984562.0, + "reward": 1.54296875, + "reward_std": 0.7529911696910858, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4185478091239929, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.3587941601872444, + "step": 1168, + "token_counts/after_target": 515.75, + "token_counts/after_think": 38.5, + "token_counts/before_target": 1507.25, + "token_counts/before_think": 701.75 + }, + { + "avg_penalty/after_target": 1.7688559293746948, + "avg_penalty/after_think": 3.633986711502075, + "avg_penalty/before_target": 0.42656102031469345, + "avg_penalty/before_think": 0.4062216281890869, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.5, + "completions/max_terminated_length": 512.5, + "completions/mean_length": 186.53125, + "completions/mean_terminated_length": 186.53125, + "completions/min_length": 52.75, + "completions/min_terminated_length": 52.75, + "epoch": 0.5845, + "grad_norm": 3.110363006591797, + "kl": 16.796875, + "learning_rate": 8.815960316934991e-06, + "loss": 1.4688, + "num_tokens": 36008980.0, + "reward": 1.62890625, + "reward_std": 0.7493261694908142, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4141380712389946, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.34967076033353806, + "step": 1169, + "token_counts/after_target": 488.0, + "token_counts/after_think": 48.5, + "token_counts/before_target": 1611.75, + "token_counts/before_think": 836.25 + }, + { + "avg_penalty/after_target": 3.312466025352478, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.35478750616312027, + "avg_penalty/before_think": 0.39214449375867844, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.75, + "completions/max_terminated_length": 508.75, + "completions/mean_length": 176.296875, + "completions/mean_terminated_length": 176.296875, + "completions/min_length": 44.25, + "completions/min_terminated_length": 44.25, + "epoch": 0.585, + "grad_norm": 3.6629903316497803, + "kl": 19.40625, + "learning_rate": 8.79863161165353e-06, + "loss": 1.6204, + "num_tokens": 36032903.0, + "reward": 1.51953125, + "reward_std": 0.7988856881856918, + "rewards/accuracy_reward/mean": 0.0, 
+ "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4339347705245018, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.3791440427303314, + "step": 1170, + "token_counts/after_target": 485.75, + "token_counts/after_think": 34.5, + "token_counts/before_target": 1386.5, + "token_counts/before_think": 914.0 + }, + { + "avg_penalty/after_target": 2.9245410561561584, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.42413976415991783, + "avg_penalty/before_think": 0.6184735968708992, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 653.25, + "completions/max_terminated_length": 623.25, + "completions/mean_length": 225.21875, + "completions/mean_terminated_length": 212.5031280517578, + "completions/min_length": 47.5, + "completions/min_terminated_length": 47.5, + "epoch": 0.5855, + "grad_norm": 3.4173200130462646, + "kl": 20.828125, + "learning_rate": 8.781306565948528e-06, + "loss": 1.9059, + "num_tokens": 36057797.0, + "reward": 1.61328125, + "reward_std": 0.7731102705001831, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4101393073797226, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.35613933205604553, + "step": 1171, + "token_counts/after_target": 810.5, + "token_counts/after_think": 131.5, + "token_counts/before_target": 1654.25, + "token_counts/before_think": 1007.25 + }, + { + "avg_penalty/after_target": 3.4598255157470703, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.2952097989618778, + "avg_penalty/before_think": 0.4236121401190758, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 648.75, + "completions/max_terminated_length": 648.75, + "completions/mean_length": 227.765625, + "completions/mean_terminated_length": 227.765625, + "completions/min_length": 69.75, + "completions/min_terminated_length": 69.75, + "epoch": 0.586, + "grad_norm": 3.77057147026062, + "kl": 22.21875, + "learning_rate": 8.763985232595076e-06, + "loss": 1.8921, + "num_tokens": 36081126.0, + "reward": 1.21875, + "reward_std": 0.70369091629982, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.35648179799318314, + "rewards/tag_count_reward/mean": 0.609375, + "rewards/tag_count_reward/std": 0.34966348111629486, + "step": 1172, + "token_counts/after_target": 864.75, + "token_counts/after_think": 26.75, + "token_counts/before_target": 1977.5, + "token_counts/before_think": 775.25 + }, + { + "avg_penalty/after_target": 2.1816755831241608, + "avg_penalty/after_think": 3.850954055786133, + "avg_penalty/before_target": 0.64537513256073, + "avg_penalty/before_think": 0.3867745101451874, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.25, + "completions/max_terminated_length": 580.25, + "completions/mean_length": 181.109375, + "completions/mean_terminated_length": 181.109375, + "completions/min_length": 41.75, + "completions/min_terminated_length": 41.75, + "epoch": 0.5865, + "grad_norm": 6.567968368530273, + "kl": 25.078125, + "learning_rate": 8.746667664356957e-06, + "loss": 2.2695, + "num_tokens": 36101517.0, + "reward": 1.50390625, + "reward_std": 0.8371373414993286, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4414467439055443, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.40616950392723083, + "step": 1173, + "token_counts/after_target": 637.25, + "token_counts/after_think": 34.75, + "token_counts/before_target": 1526.0, + "token_counts/before_think": 699.75 + }, + { + "avg_penalty/after_target": 2.737920731306076, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3706624135375023, + "avg_penalty/before_think": 0.3743384890258312, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.25, + "completions/max_terminated_length": 808.25, + "completions/mean_length": 187.25, + "completions/mean_terminated_length": 187.25, + "completions/min_length": 51.25, + "completions/min_terminated_length": 51.25, + "epoch": 0.587, + "grad_norm": 3.613450050354004, + "kl": 19.9453125, + "learning_rate": 8.729353913986495e-06, + "loss": 1.6254, + "num_tokens": 36123789.0, + "reward": 1.61328125, + "reward_std": 0.7611464262008667, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.36863839626312256, + "step": 1174, + "token_counts/after_target": 434.25, + "token_counts/after_think": 22.5, + "token_counts/before_target": 1987.0, + "token_counts/before_think": 552.25 + }, + { + "avg_penalty/after_target": 2.4950341284275055, + "avg_penalty/after_think": 3.7800358533859253, + "avg_penalty/before_target": 0.3102097660303116, + "avg_penalty/before_think": 0.5022474601864815, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.25, + "completions/max_terminated_length": 551.25, + "completions/mean_length": 214.09375, + "completions/mean_terminated_length": 214.09375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.5875, + "grad_norm": 10.589685440063477, + "kl": 8.97265625, + "learning_rate": 8.712044034224374e-06, + "loss": 1.1541, + "num_tokens": 36147315.0, + "reward": 1.765625, + "reward_std": 0.5574804693460464, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3375816270709038, + "rewards/tag_count_reward/mean": 0.90625, + "rewards/tag_count_reward/std": 0.22746434807777405, + "step": 1175, + "token_counts/after_target": 305.75, + "token_counts/after_think": 41.5, + "token_counts/before_target": 1312.0, + "token_counts/before_think": 1766.25 + }, + { + "avg_penalty/after_target": 2.7322935461997986, + "avg_penalty/after_think": 3.891383945941925, + "avg_penalty/before_target": 0.37844184413552284, + "avg_penalty/before_think": 0.6669133901596069, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.75, + "completions/max_terminated_length": 577.75, + "completions/mean_length": 208.765625, + "completions/mean_terminated_length": 208.765625, + "completions/min_length": 51.25, + "completions/min_terminated_length": 51.25, + "epoch": 0.588, + "grad_norm": 10.345643043518066, + "kl": 21.625, + "learning_rate": 8.694738077799487e-06, + "loss": 2.0803, + "num_tokens": 36170660.0, + "reward": 1.58984375, + "reward_std": 0.7830553948879242, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.765625, + 
"rewards/format_reward/std": 0.42733466625213623, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.35854072123765945, + "step": 1176, + "token_counts/after_target": 770.25, + "token_counts/after_think": 77.75, + "token_counts/before_target": 1679.0, + "token_counts/before_think": 813.25 + }, + { + "avg_penalty/after_target": 3.029648095369339, + "avg_penalty/after_think": 3.69070702791214, + "avg_penalty/before_target": 0.44497932493686676, + "avg_penalty/before_think": 0.40240994840860367, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 566.25, + "completions/max_terminated_length": 458.25, + "completions/mean_length": 180.546875, + "completions/mean_terminated_length": 167.90625381469727, + "completions/min_length": 43.5, + "completions/min_terminated_length": 43.5, + "epoch": 0.5885, + "grad_norm": 7.042144775390625, + "kl": 13.83203125, + "learning_rate": 8.677436097428775e-06, + "loss": 1.3693, + "num_tokens": 36190279.0, + "reward": 1.6875, + "reward_std": 0.696221336722374, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38772592693567276, + "rewards/tag_count_reward/mean": 0.859375, + "rewards/tag_count_reward/std": 0.3153250552713871, + "step": 1177, + "token_counts/after_target": 608.0, + "token_counts/after_think": 54.75, + "token_counts/before_target": 1575.25, + "token_counts/before_think": 650.75 + }, + { + "avg_penalty/after_target": 1.9152032732963562, + "avg_penalty/after_think": 3.746575653553009, + "avg_penalty/before_target": 0.3997759148478508, + "avg_penalty/before_think": 0.4695097655057907, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.75, + "completions/max_terminated_length": 508.75, + "completions/mean_length": 202.3125, + "completions/mean_terminated_length": 202.3125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.589, + "grad_norm": 3.72648549079895, + "kl": 12.7109375, + "learning_rate": 8.66013814581708e-06, + "loss": 1.2197, + "num_tokens": 36213675.0, + "reward": 1.61328125, + "reward_std": 0.7455717772245407, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.34423689544200897, + "step": 1178, + "token_counts/after_target": 419.25, + "token_counts/after_think": 52.5, + "token_counts/before_target": 1780.75, + "token_counts/before_think": 984.5 + }, + { + "avg_penalty/after_target": 2.9876019954681396, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.3679503798484802, + "avg_penalty/before_think": 0.5362621620297432, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 834.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 250.53125, + "completions/mean_terminated_length": 225.63646697998047, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.5895, + "grad_norm": 6.559804916381836, + "kl": 23.0, + "learning_rate": 8.642844275656957e-06, + "loss": 2.1052, + "num_tokens": 36241613.0, + "reward": 1.5859375, + "reward_std": 0.7412333786487579, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4079566150903702, + 
"rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.3507167771458626, + "step": 1179, + "token_counts/after_target": 1153.25, + "token_counts/after_think": 15.0, + "token_counts/before_target": 1854.75, + "token_counts/before_think": 985.5 + }, + { + "avg_penalty/after_target": 2.547134667634964, + "avg_penalty/after_think": 3.869314968585968, + "avg_penalty/before_target": 0.36601877212524414, + "avg_penalty/before_think": 0.47333189845085144, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.5, + "completions/max_terminated_length": 495.5, + "completions/mean_length": 203.921875, + "completions/mean_terminated_length": 203.921875, + "completions/min_length": 56.75, + "completions/min_terminated_length": 56.75, + "epoch": 0.59, + "grad_norm": 7.13609504699707, + "kl": 13.53125, + "learning_rate": 8.625554539628536e-06, + "loss": 1.4417, + "num_tokens": 36265896.0, + "reward": 1.703125, + "reward_std": 0.66729936003685, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38336414843797684, + "rewards/tag_count_reward/mean": 0.875, + "rewards/tag_count_reward/std": 0.2972251623868942, + "step": 1180, + "token_counts/after_target": 496.75, + "token_counts/after_think": 88.5, + "token_counts/before_target": 1791.5, + "token_counts/before_think": 886.0 + }, + { + "avg_penalty/after_target": 2.3578876852989197, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3506801016628742, + "avg_penalty/before_think": 0.46189134567976, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + 
"completions/max_terminated_length": 686.0, + "completions/mean_length": 227.390625, + "completions/mean_terminated_length": 227.390625, + "completions/min_length": 45.25, + "completions/min_terminated_length": 45.25, + "epoch": 0.5905, + "grad_norm": 8.04395866394043, + "kl": 22.09375, + "learning_rate": 8.60826899039935e-06, + "loss": 1.6636, + "num_tokens": 36292417.0, + "reward": 1.53515625, + "reward_std": 0.7821587473154068, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4260597825050354, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.36984870582818985, + "step": 1181, + "token_counts/after_target": 584.75, + "token_counts/after_think": 49.5, + "token_counts/before_target": 1955.0, + "token_counts/before_think": 1049.0 + }, + { + "avg_penalty/after_target": 2.1193195581436157, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4875180572271347, + "avg_penalty/before_think": 0.509366400539875, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 827.5, + "completions/max_terminated_length": 619.5, + "completions/mean_length": 249.390625, + "completions/mean_terminated_length": 235.09479331970215, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.591, + "grad_norm": 8.870963096618652, + "kl": 29.6875, + "learning_rate": 8.590987680624174e-06, + "loss": 2.3076, + "num_tokens": 36318522.0, + "reward": 1.5703125, + "reward_std": 0.7170143127441406, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4185478091239929, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 
0.33064714446663857, + "step": 1182, + "token_counts/after_target": 650.75, + "token_counts/after_think": 79.0, + "token_counts/before_target": 1970.5, + "token_counts/before_think": 1290.0 + }, + { + "avg_penalty/after_target": 2.9791935086250305, + "avg_penalty/after_think": 2.6174747347831726, + "avg_penalty/before_target": 0.5373440980911255, + "avg_penalty/before_think": 0.4383440986275673, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 223.484375, + "completions/mean_terminated_length": 223.484375, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.5915, + "grad_norm": 9.26897144317627, + "kl": 26.703125, + "learning_rate": 8.573710662944884e-06, + "loss": 2.0516, + "num_tokens": 36344681.0, + "reward": 1.54296875, + "reward_std": 0.7967594414949417, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.18616948276758194, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.3811737895011902, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.31933508813381195, + "step": 1183, + "token_counts/after_target": 858.25, + "token_counts/after_think": 35.75, + "token_counts/before_target": 2005.25, + "token_counts/before_think": 676.5 + }, + { + "avg_penalty/after_target": 2.7302218675613403, + "avg_penalty/after_think": 3.710500121116638, + "avg_penalty/before_target": 0.36741164326667786, + "avg_penalty/before_think": 0.5730115920305252, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.0, + "completions/max_terminated_length": 683.0, 
+ "completions/mean_length": 239.84375, + "completions/mean_terminated_length": 239.84375, + "completions/min_length": 48.25, + "completions/min_terminated_length": 48.25, + "epoch": 0.592, + "grad_norm": 8.235810279846191, + "kl": 31.71875, + "learning_rate": 8.55643798999027e-06, + "loss": 2.487, + "num_tokens": 36368671.0, + "reward": 1.4453125, + "reward_std": 0.814336508512497, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.479247085750103, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.3657388612627983, + "step": 1184, + "token_counts/after_target": 873.5, + "token_counts/after_think": 159.75, + "token_counts/before_target": 1991.75, + "token_counts/before_think": 812.5 + }, + { + "avg_penalty/after_target": 2.569019854068756, + "avg_penalty/after_think": 3.5129497051239014, + "avg_penalty/before_target": 0.5618335381150246, + "avg_penalty/before_think": 0.520808108150959, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 828.5, + "completions/max_terminated_length": 709.25, + "completions/mean_length": 289.90625, + "completions/mean_terminated_length": 278.9072952270508, + "completions/min_length": 52.5, + "completions/min_terminated_length": 52.5, + "epoch": 0.5925, + "grad_norm": 10.393336296081543, + "kl": 31.28125, + "learning_rate": 8.539169714375885e-06, + "loss": 2.4148, + "num_tokens": 36399097.0, + "reward": 1.4453125, + "reward_std": 0.7989259213209152, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.47354350984096527, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.3577791154384613, + "step": 1185, 
+ "token_counts/after_target": 1302.75, + "token_counts/after_think": 64.25, + "token_counts/before_target": 1972.75, + "token_counts/before_think": 1298.75 + }, + { + "avg_penalty/after_target": 2.66848886013031, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.35635756328701973, + "avg_penalty/before_think": 0.5231147408485413, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.75, + "completions/max_terminated_length": 492.75, + "completions/mean_length": 191.53125, + "completions/mean_terminated_length": 191.53125, + "completions/min_length": 58.5, + "completions/min_terminated_length": 58.5, + "epoch": 0.593, + "grad_norm": 7.21679162979126, + "kl": 22.421875, + "learning_rate": 8.521905888703894e-06, + "loss": 1.7189, + "num_tokens": 36421483.0, + "reward": 1.50390625, + "reward_std": 0.8029944598674774, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.44495995342731476, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.38083241134881973, + "step": 1186, + "token_counts/after_target": 474.5, + "token_counts/after_think": 38.75, + "token_counts/before_target": 1698.0, + "token_counts/before_think": 853.25 + }, + { + "avg_penalty/after_target": 2.152956634759903, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.37339315563440323, + "avg_penalty/before_think": 0.5753150433301926, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 648.25, + "completions/max_terminated_length": 648.25, + "completions/mean_length": 296.765625, + "completions/mean_terminated_length": 
296.765625, + "completions/min_length": 63.5, + "completions/min_terminated_length": 63.5, + "epoch": 0.5935, + "grad_norm": 5.306480884552002, + "kl": 19.890625, + "learning_rate": 8.504646565562907e-06, + "loss": 1.6123, + "num_tokens": 36450764.0, + "reward": 1.58203125, + "reward_std": 0.7011785060167313, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4185478091239929, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.30423202365636826, + "step": 1187, + "token_counts/after_target": 728.0, + "token_counts/after_think": 331.0, + "token_counts/before_target": 2474.75, + "token_counts/before_think": 1214.5 + }, + { + "avg_penalty/after_target": 2.43087500333786, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3489353582262993, + "avg_penalty/before_think": 0.6313558369874954, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 199.515625, + "completions/mean_terminated_length": 199.515625, + "completions/min_length": 55.5, + "completions/min_terminated_length": 55.5, + "epoch": 0.594, + "grad_norm": 3.640990734100342, + "kl": 17.3671875, + "learning_rate": 8.487391797527808e-06, + "loss": 1.4719, + "num_tokens": 36474781.0, + "reward": 1.6015625, + "reward_std": 0.7347017377614975, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.33086491003632545, + "step": 1188, + "token_counts/after_target": 469.25, + "token_counts/after_think": 28.75, + "token_counts/before_target": 
1925.0, + "token_counts/before_think": 769.25 + }, + { + "avg_penalty/after_target": 3.3189454078674316, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3932493254542351, + "avg_penalty/before_think": 0.508488617837429, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 699.25, + "completions/max_terminated_length": 685.5, + "completions/mean_length": 309.375, + "completions/mean_terminated_length": 299.06459045410156, + "completions/min_length": 79.75, + "completions/min_terminated_length": 79.75, + "epoch": 0.5945, + "grad_norm": 8.95439338684082, + "kl": 20.265625, + "learning_rate": 8.47014163715962e-06, + "loss": 1.9505, + "num_tokens": 36503973.0, + "reward": 1.44921875, + "reward_std": 0.7663559913635254, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.466681070625782, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.3309398740530014, + "step": 1189, + "token_counts/after_target": 1236.0, + "token_counts/after_think": 206.25, + "token_counts/before_target": 2552.0, + "token_counts/before_think": 955.75 + }, + { + "avg_penalty/after_target": 2.7667948603630066, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.40509654581546783, + "avg_penalty/before_think": 0.6427319049835205, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.5, + "completions/max_terminated_length": 581.5, + "completions/mean_length": 246.5, + "completions/mean_terminated_length": 246.5, + "completions/min_length": 37.75, + "completions/min_terminated_length": 37.75, + "epoch": 0.595, 
+ "grad_norm": 2.9971508979797363, + "kl": 19.0625, + "learning_rate": 8.452896137005322e-06, + "loss": 1.6816, + "num_tokens": 36529941.0, + "reward": 1.4375, + "reward_std": 0.8371139466762543, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.48148179799318314, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.39517470449209213, + "step": 1190, + "token_counts/after_target": 960.0, + "token_counts/after_think": 41.75, + "token_counts/before_target": 2210.0, + "token_counts/before_think": 732.25 + }, + { + "avg_penalty/after_target": 3.0354800820350647, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.43210216984152794, + "avg_penalty/before_think": 0.4361015558242798, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 589.25, + "completions/max_terminated_length": 524.25, + "completions/mean_length": 233.8125, + "completions/mean_terminated_length": 223.08646392822266, + "completions/min_length": 38.75, + "completions/min_terminated_length": 38.75, + "epoch": 0.5955, + "grad_norm": 5.659383296966553, + "kl": 12.984375, + "learning_rate": 8.43565534959769e-06, + "loss": 1.2675, + "num_tokens": 36555609.0, + "reward": 1.65234375, + "reward_std": 0.705091804265976, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.39656074345111847, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.32354310154914856, + "step": 1191, + "token_counts/after_target": 792.75, + "token_counts/after_think": 51.75, + "token_counts/before_target": 2065.5, + "token_counts/before_think": 831.0 + }, + { + "avg_penalty/after_target": 
2.889669418334961, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.354168139398098, + "avg_penalty/before_think": 0.5771506652235985, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.5, + "completions/max_terminated_length": 555.5, + "completions/mean_length": 234.4375, + "completions/mean_terminated_length": 234.4375, + "completions/min_length": 61.25, + "completions/min_terminated_length": 61.25, + "epoch": 0.596, + "grad_norm": 6.3526082038879395, + "kl": 13.45703125, + "learning_rate": 8.418419327455166e-06, + "loss": 1.2833, + "num_tokens": 36579029.0, + "reward": 1.59765625, + "reward_std": 0.7470792382955551, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42867646366357803, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.34890443831682205, + "step": 1192, + "token_counts/after_target": 808.5, + "token_counts/after_think": 23.75, + "token_counts/before_target": 1836.25, + "token_counts/before_think": 1082.5 + }, + { + "avg_penalty/after_target": 3.5340269207954407, + "avg_penalty/after_think": 3.8228753805160522, + "avg_penalty/before_target": 0.2605091966688633, + "avg_penalty/before_think": 0.7247716635465622, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.5, + "completions/max_terminated_length": 607.5, + "completions/mean_length": 221.703125, + "completions/mean_terminated_length": 221.703125, + "completions/min_length": 40.25, + "completions/min_terminated_length": 40.25, + "epoch": 0.5965, + "grad_norm": 5.498198986053467, + "kl": 21.21875, + 
"learning_rate": 8.401188123081653e-06, + "loss": 1.7102, + "num_tokens": 36607362.0, + "reward": 1.38671875, + "reward_std": 0.9143245965242386, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4761601909995079, + "rewards/tag_count_reward/mean": 0.69921875, + "rewards/tag_count_reward/std": 0.4576219916343689, + "step": 1193, + "token_counts/after_target": 583.5, + "token_counts/after_think": 20.75, + "token_counts/before_target": 2103.25, + "token_counts/before_think": 839.75 + }, + { + "avg_penalty/after_target": 2.815902829170227, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.45542750507593155, + "avg_penalty/before_think": 0.5623437985777855, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 695.75, + "completions/max_terminated_length": 579.25, + "completions/mean_length": 283.390625, + "completions/mean_terminated_length": 271.5843811035156, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.597, + "grad_norm": 8.50460433959961, + "kl": 17.03125, + "learning_rate": 8.38396178896639e-06, + "loss": 1.734, + "num_tokens": 36635131.0, + "reward": 1.44140625, + "reward_std": 0.807770162820816, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.47669370472431183, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.35849592462182045, + "step": 1194, + "token_counts/after_target": 1128.5, + "token_counts/after_think": 98.0, + "token_counts/before_target": 2203.0, + "token_counts/before_think": 1104.75 + }, + { + "avg_penalty/after_target": 2.4043559432029724, + "avg_penalty/after_think": 3.6885305047035217, 
+ "avg_penalty/before_target": 0.39945774525403976, + "avg_penalty/before_think": 0.5972466468811035, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.25, + "completions/max_terminated_length": 639.25, + "completions/mean_length": 229.578125, + "completions/mean_terminated_length": 229.578125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.5975, + "grad_norm": 3.1359148025512695, + "kl": 13.9296875, + "learning_rate": 8.366740377583781e-06, + "loss": 1.2022, + "num_tokens": 36660704.0, + "reward": 1.6484375, + "reward_std": 0.5932402461767197, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.30717839300632477, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.29433435946702957, + "step": 1195, + "token_counts/after_target": 524.25, + "token_counts/after_think": 58.25, + "token_counts/before_target": 2338.5, + "token_counts/before_think": 752.25 + }, + { + "avg_penalty/after_target": 3.031473159790039, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.35591063648462296, + "avg_penalty/before_think": 0.43037451058626175, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.75, + "completions/max_terminated_length": 604.75, + "completions/mean_length": 199.9375, + "completions/mean_terminated_length": 199.9375, + "completions/min_length": 59.25, + "completions/min_terminated_length": 59.25, + "epoch": 0.598, + "grad_norm": 2.7636263370513916, + "kl": 20.171875, + "learning_rate": 8.349523941393224e-06, + "loss": 1.6694, + "num_tokens": 
36683660.0, + "reward": 1.4453125, + "reward_std": 0.8565750271081924, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.47083858400583267, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.39911653101444244, + "step": 1196, + "token_counts/after_target": 594.5, + "token_counts/after_think": 33.75, + "token_counts/before_target": 1960.0, + "token_counts/before_think": 610.75 + }, + { + "avg_penalty/after_target": 2.0363034307956696, + "avg_penalty/after_think": 2.7931575179100037, + "avg_penalty/before_target": 0.36065446585416794, + "avg_penalty/before_think": 0.48769211024045944, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.0, + "completions/max_terminated_length": 550.0, + "completions/mean_length": 234.59375, + "completions/mean_terminated_length": 234.59375, + "completions/min_length": 64.25, + "completions/min_terminated_length": 64.25, + "epoch": 0.5985, + "grad_norm": 3.408714771270752, + "kl": 14.234375, + "learning_rate": 8.332312532838978e-06, + "loss": 1.3215, + "num_tokens": 36706818.0, + "reward": 1.57421875, + "reward_std": 0.7769313454627991, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4229728877544403, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.3703637421131134, + "step": 1197, + "token_counts/after_target": 602.0, + "token_counts/after_think": 131.0, + "token_counts/before_target": 2016.25, + "token_counts/before_think": 1004.25 + }, + { + "avg_penalty/after_target": 2.784388452768326, + "avg_penalty/after_think": 3.724020838737488, + "avg_penalty/before_target": 0.4798629768192768, + 
"avg_penalty/before_think": 0.7060720100998878, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 703.5, + "completions/max_terminated_length": 600.25, + "completions/mean_length": 231.640625, + "completions/mean_terminated_length": 219.3687515258789, + "completions/min_length": 53.25, + "completions/min_terminated_length": 53.25, + "epoch": 0.599, + "grad_norm": 11.360419273376465, + "kl": 13.9921875, + "learning_rate": 8.315106204349976e-06, + "loss": 1.6183, + "num_tokens": 36731739.0, + "reward": 1.63671875, + "reward_std": 0.7292506992816925, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.3385658413171768, + "step": 1198, + "token_counts/after_target": 1010.0, + "token_counts/after_think": 102.0, + "token_counts/before_target": 1869.75, + "token_counts/before_think": 724.5 + }, + { + "avg_penalty/after_target": 2.4719162583351135, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.34232208132743835, + "avg_penalty/before_think": 0.48689544945955276, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 611.5, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 256.796875, + "completions/mean_terminated_length": 245.99166870117188, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.5995, + "grad_norm": 7.468428134918213, + "kl": 17.78125, + "learning_rate": 8.297905008339677e-06, + "loss": 1.3538, + "num_tokens": 36756062.0, + "reward": 1.4453125, 
+ "reward_std": 0.8470521122217178, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4534844756126404, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.4107813388109207, + "step": 1199, + "token_counts/after_target": 697.0, + "token_counts/after_think": 35.0, + "token_counts/before_target": 2556.75, + "token_counts/before_think": 820.0 + }, + { + "avg_penalty/after_target": 2.296560287475586, + "avg_penalty/after_think": 2.951765537261963, + "avg_penalty/before_target": 0.40994641929864883, + "avg_penalty/before_think": 0.8420542180538177, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.75, + "completions/max_terminated_length": 660.75, + "completions/mean_length": 325.53125, + "completions/mean_terminated_length": 325.53125, + "completions/min_length": 69.75, + "completions/min_terminated_length": 69.75, + "epoch": 0.6, + "grad_norm": 2.618452310562134, + "kl": 14.140625, + "learning_rate": 8.280708997205904e-06, + "loss": 1.3204, + "num_tokens": 36787008.0, + "reward": 1.5546875, + "reward_std": 0.7994018346071243, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44938503205776215, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.35437440872192383, + "step": 1200, + "token_counts/after_target": 1110.0, + "token_counts/after_think": 40.0, + "token_counts/before_target": 3015.25, + "token_counts/before_think": 1043.25 + }, + { + "avg_penalty/after_target": 3.3233978152275085, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.36355170235037804, + "avg_penalty/before_think": 0.5577127858996391, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.25, + "completions/max_terminated_length": 634.25, + "completions/mean_length": 238.1875, + "completions/mean_terminated_length": 238.1875, + "completions/min_length": 35.5, + "completions/min_terminated_length": 35.5, + "epoch": 0.6005, + "grad_norm": 7.937072277069092, + "kl": 12.03515625, + "learning_rate": 8.263518223330698e-06, + "loss": 1.3236, + "num_tokens": 36816188.0, + "reward": 1.68359375, + "reward_std": 0.7839090526103973, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.11180340498685837, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.38724804669618607, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.34561219066381454, + "step": 1201, + "token_counts/after_target": 856.25, + "token_counts/after_think": 104.75, + "token_counts/before_target": 1959.25, + "token_counts/before_think": 890.75 + }, + { + "avg_penalty/after_target": 2.9976620078086853, + "avg_penalty/after_think": 3.967422068119049, + "avg_penalty/before_target": 0.3752630837261677, + "avg_penalty/before_think": 0.5085486322641373, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 561.75, + "completions/max_terminated_length": 437.5, + "completions/mean_length": 236.6875, + "completions/mean_terminated_length": 224.9260482788086, + "completions/min_length": 41.75, + "completions/min_terminated_length": 41.75, + "epoch": 0.601, + "grad_norm": 8.798630714416504, + "kl": 19.5, + "learning_rate": 8.246332739080131e-06, + "loss": 1.4296, + "num_tokens": 36840408.0, + "reward": 1.40625, + "reward_std": 0.8474786728620529, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4788651168346405, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.3959256485104561, + "step": 1202, + "token_counts/after_target": 700.75, + "token_counts/after_think": 38.0, + "token_counts/before_target": 1994.5, + "token_counts/before_think": 1053.75 + }, + { + "avg_penalty/after_target": 2.6637499928474426, + "avg_penalty/after_think": 1.759602427482605, + "avg_penalty/before_target": 0.394389808177948, + "avg_penalty/before_think": 0.43994250893592834, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 230.40625, + "completions/mean_terminated_length": 230.40625, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.6015, + "grad_norm": 7.931985855102539, + "kl": 27.34375, + "learning_rate": 8.22915259680417e-06, + "loss": 2.0969, + "num_tokens": 36866578.0, + "reward": 1.125, + "reward_std": 0.9414405077695847, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.515625, + "rewards/format_reward/std": 0.5112857818603516, + "rewards/tag_count_reward/mean": 0.609375, + "rewards/tag_count_reward/std": 0.44976506382226944, + "step": 1203, + "token_counts/after_target": 849.25, + "token_counts/after_think": 15.25, + "token_counts/before_target": 2162.25, + "token_counts/before_think": 659.75 + }, + { + "avg_penalty/after_target": 2.02480149269104, + "avg_penalty/after_think": 3.6947625875473022, + "avg_penalty/before_target": 0.5027395710349083, + "avg_penalty/before_think": 0.5533972457051277, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 
0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 827.75, + "completions/max_terminated_length": 809.5, + "completions/mean_length": 303.84375, + "completions/mean_terminated_length": 292.11458587646484, + "completions/min_length": 77.5, + "completions/min_terminated_length": 77.5, + "epoch": 0.602, + "grad_norm": 2.262709140777588, + "kl": 14.234375, + "learning_rate": 8.211977848836505e-06, + "loss": 1.2723, + "num_tokens": 36898136.0, + "reward": 1.51953125, + "reward_std": 0.8264408260583878, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.47083858400583267, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.37902653217315674, + "step": 1204, + "token_counts/after_target": 1000.0, + "token_counts/after_think": 72.5, + "token_counts/before_target": 2426.5, + "token_counts/before_think": 1362.5 + }, + { + "avg_penalty/after_target": 2.783906191587448, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4019703194499016, + "avg_penalty/before_think": 0.4361482113599777, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.5, + "completions/max_terminated_length": 627.5, + "completions/mean_length": 230.84375, + "completions/mean_terminated_length": 230.84375, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.6025, + "grad_norm": 7.284293174743652, + "kl": 24.65625, + "learning_rate": 8.194808547494401e-06, + "loss": 1.8973, + "num_tokens": 36923374.0, + "reward": 1.32421875, + "reward_std": 0.8856752067804337, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.49345622956752777, + "rewards/tag_count_reward/mean": 0.69921875, + "rewards/tag_count_reward/std": 0.41239843517541885, + "step": 1205, + "token_counts/after_target": 703.5, + "token_counts/after_think": 33.5, + "token_counts/before_target": 2078.5, + "token_counts/before_think": 878.0 + }, + { + "avg_penalty/after_target": 2.5418298542499542, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.36508213356137276, + "avg_penalty/before_think": 0.3966113328933716, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.5, + "completions/max_terminated_length": 456.5, + "completions/mean_length": 217.0, + "completions/mean_terminated_length": 217.0, + "completions/min_length": 38.5, + "completions/min_terminated_length": 38.5, + "epoch": 0.603, + "grad_norm": 4.704512119293213, + "kl": 16.84375, + "learning_rate": 8.177644745078525e-06, + "loss": 1.343, + "num_tokens": 36950190.0, + "reward": 1.45703125, + "reward_std": 0.8925604075193405, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.46296359598636627, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.43601298332214355, + "step": 1206, + "token_counts/after_target": 596.5, + "token_counts/after_think": 60.5, + "token_counts/before_target": 2031.25, + "token_counts/before_think": 783.75 + }, + { + "avg_penalty/after_target": 3.3117825984954834, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.27000949904322624, + "avg_penalty/before_think": 0.6366135329008102, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.75, + "completions/max_terminated_length": 587.75, + "completions/mean_length": 219.84375, + "completions/mean_terminated_length": 219.84375, + "completions/min_length": 51.25, + "completions/min_terminated_length": 51.25, + "epoch": 0.6035, + "grad_norm": 2.596388578414917, + "kl": 16.40625, + "learning_rate": 8.1604864938728e-06, + "loss": 1.4974, + "num_tokens": 36978084.0, + "reward": 1.58203125, + "reward_std": 0.7656923532485962, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.43303824216127396, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.3380301594734192, + "step": 1207, + "token_counts/after_target": 572.0, + "token_counts/after_think": 22.5, + "token_counts/before_target": 2041.5, + "token_counts/before_think": 881.5 + }, + { + "avg_penalty/after_target": 2.4804799258708954, + "avg_penalty/after_think": 3.826418101787567, + "avg_penalty/before_target": 0.3916171230375767, + "avg_penalty/before_think": 0.466479629278183, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.75, + "completions/max_terminated_length": 492.75, + "completions/mean_length": 216.25, + "completions/mean_terminated_length": 216.25, + "completions/min_length": 46.75, + "completions/min_terminated_length": 46.75, + "epoch": 0.604, + "grad_norm": 3.1753997802734375, + "kl": 13.21875, + "learning_rate": 8.143333846144231e-06, + "loss": 1.2194, + "num_tokens": 37000884.0, + "reward": 1.48828125, + "reward_std": 0.823275551199913, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 
0.4503342807292938, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.3775478005409241, + "step": 1208, + "token_counts/after_target": 571.75, + "token_counts/after_think": 117.0, + "token_counts/before_target": 1743.5, + "token_counts/before_think": 1027.75 + }, + { + "avg_penalty/after_target": 2.5889079570770264, + "avg_penalty/after_think": 3.9862678050994873, + "avg_penalty/before_target": 0.35728855431079865, + "avg_penalty/before_think": 0.5733926519751549, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 626.5, + "completions/max_terminated_length": 478.75, + "completions/mean_length": 263.28125, + "completions/mean_terminated_length": 251.63958740234375, + "completions/min_length": 62.5, + "completions/min_terminated_length": 62.5, + "epoch": 0.6045, + "grad_norm": 3.5761566162109375, + "kl": 17.765625, + "learning_rate": 8.126186854142752e-06, + "loss": 1.6069, + "num_tokens": 37025446.0, + "reward": 1.41796875, + "reward_std": 0.8460703045129776, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.46875541657209396, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.40200673788785934, + "step": 1209, + "token_counts/after_target": 808.25, + "token_counts/after_think": 170.0, + "token_counts/before_target": 2192.25, + "token_counts/before_think": 1042.0 + }, + { + "avg_penalty/after_target": 2.27080962061882, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.47305090725421906, + "avg_penalty/before_think": 0.4608401730656624, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 
0.015625, + "completions/max_length": 643.75, + "completions/max_terminated_length": 587.25, + "completions/mean_length": 219.9375, + "completions/mean_terminated_length": 207.04270935058594, + "completions/min_length": 52.75, + "completions/min_terminated_length": 52.75, + "epoch": 0.605, + "grad_norm": 3.5289154052734375, + "kl": 18.89453125, + "learning_rate": 8.109045570101086e-06, + "loss": 1.6876, + "num_tokens": 37049618.0, + "reward": 1.47265625, + "reward_std": 0.8037658333778381, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.42206869274377823, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.379179872572422, + "step": 1210, + "token_counts/after_target": 856.75, + "token_counts/after_think": 45.0, + "token_counts/before_target": 1840.0, + "token_counts/before_think": 777.25 + }, + { + "avg_penalty/after_target": 2.39974582195282, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3358514681458473, + "avg_penalty/before_think": 0.5623903647065163, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.75, + "completions/max_terminated_length": 553.75, + "completions/mean_length": 223.65625, + "completions/mean_terminated_length": 223.65625, + "completions/min_length": 49.5, + "completions/min_terminated_length": 49.5, + "epoch": 0.6055, + "grad_norm": 2.8704066276550293, + "kl": 13.8046875, + "learning_rate": 8.091910046234552e-06, + "loss": 1.1772, + "num_tokens": 37077324.0, + "reward": 1.56640625, + "reward_std": 0.781192809343338, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.44091323018074036, + "rewards/tag_count_reward/mean": 
0.81640625, + "rewards/tag_count_reward/std": 0.3620489314198494, + "step": 1211, + "token_counts/after_target": 442.5, + "token_counts/after_think": 118.0, + "token_counts/before_target": 2036.0, + "token_counts/before_think": 982.0 + }, + { + "avg_penalty/after_target": 2.3768556118011475, + "avg_penalty/after_think": 1.8849756121635437, + "avg_penalty/before_target": 0.4637776091694832, + "avg_penalty/before_think": 0.44211896508932114, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 697.25, + "completions/max_terminated_length": 697.25, + "completions/mean_length": 251.890625, + "completions/mean_terminated_length": 251.890625, + "completions/min_length": 61.25, + "completions/min_terminated_length": 61.25, + "epoch": 0.606, + "grad_norm": 5.206826686859131, + "kl": 18.890625, + "learning_rate": 8.074780334740929e-06, + "loss": 1.7501, + "num_tokens": 37102037.0, + "reward": 1.5546875, + "reward_std": 0.7996904253959656, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4154609143733978, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.39062195271253586, + "step": 1212, + "token_counts/after_target": 859.5, + "token_counts/after_think": 122.25, + "token_counts/before_target": 1920.0, + "token_counts/before_think": 1128.5 + }, + { + "avg_penalty/after_target": 2.3509649634361267, + "avg_penalty/after_think": 3.750344753265381, + "avg_penalty/before_target": 0.47226879745721817, + "avg_penalty/before_think": 0.47018367052078247, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.5, + 
"completions/max_terminated_length": 745.5, + "completions/mean_length": 282.6875, + "completions/mean_terminated_length": 282.6875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.6065, + "grad_norm": 7.098108768463135, + "kl": 24.15625, + "learning_rate": 8.057656487800283e-06, + "loss": 1.8806, + "num_tokens": 37130961.0, + "reward": 1.2421875, + "reward_std": 0.8908717483282089, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.4761601909995079, + "rewards/tag_count_reward/mean": 0.6484375, + "rewards/tag_count_reward/std": 0.43302205204963684, + "step": 1213, + "token_counts/after_target": 986.25, + "token_counts/after_think": 88.75, + "token_counts/before_target": 2426.75, + "token_counts/before_think": 1021.25 + }, + { + "avg_penalty/after_target": 2.7372381389141083, + "avg_penalty/after_think": 2.7967806458473206, + "avg_penalty/before_target": 0.4929578974843025, + "avg_penalty/before_think": 0.512685053050518, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.5, + "completions/max_terminated_length": 695.5, + "completions/mean_length": 275.828125, + "completions/mean_terminated_length": 275.828125, + "completions/min_length": 56.5, + "completions/min_terminated_length": 56.5, + "epoch": 0.607, + "grad_norm": 5.761022567749023, + "kl": 18.515625, + "learning_rate": 8.040538557574822e-06, + "loss": 1.7079, + "num_tokens": 37161398.0, + "reward": 1.4921875, + "reward_std": 0.8412300497293472, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 
0.39774149656295776, + "step": 1214, + "token_counts/after_target": 1112.5, + "token_counts/after_think": 34.0, + "token_counts/before_target": 1960.5, + "token_counts/before_think": 1306.25 + }, + { + "avg_penalty/after_target": 2.0378445386886597, + "avg_penalty/after_think": 3.9582483172416687, + "avg_penalty/before_target": 0.39681681245565414, + "avg_penalty/before_think": 0.47674626111984253, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.25, + "completions/max_terminated_length": 564.25, + "completions/mean_length": 221.046875, + "completions/mean_terminated_length": 221.046875, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.6075, + "grad_norm": 6.328453063964844, + "kl": 20.125, + "learning_rate": 8.023426596208739e-06, + "loss": 1.5776, + "num_tokens": 37185401.0, + "reward": 1.5078125, + "reward_std": 0.8528490513563156, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.45508860796689987, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.41674554347991943, + "step": 1215, + "token_counts/after_target": 497.0, + "token_counts/after_think": 43.0, + "token_counts/before_target": 1997.75, + "token_counts/before_think": 999.0 + }, + { + "avg_penalty/after_target": 3.1060957312583923, + "avg_penalty/after_think": 2.5831324458122253, + "avg_penalty/before_target": 0.3006952777504921, + "avg_penalty/before_think": 0.5639495849609375, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.25, + "completions/max_terminated_length": 485.25, + 
"completions/mean_length": 187.171875, + "completions/mean_terminated_length": 187.171875, + "completions/min_length": 36.5, + "completions/min_terminated_length": 36.5, + "epoch": 0.608, + "grad_norm": 6.275959491729736, + "kl": 22.125, + "learning_rate": 8.00632065582803e-06, + "loss": 1.7148, + "num_tokens": 37207876.0, + "reward": 1.39453125, + "reward_std": 0.9133233428001404, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.47360680997371674, + "rewards/tag_count_reward/mean": 0.70703125, + "rewards/tag_count_reward/std": 0.4503571167588234, + "step": 1216, + "token_counts/after_target": 403.75, + "token_counts/after_think": 41.0, + "token_counts/before_target": 1864.5, + "token_counts/before_think": 685.5 + }, + { + "avg_penalty/after_target": 2.626821666955948, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3731185793876648, + "avg_penalty/before_think": 0.5639810934662819, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 246.375, + "completions/mean_terminated_length": 246.375, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.6085, + "grad_norm": 4.501637935638428, + "kl": 17.421875, + "learning_rate": 7.989220788540356e-06, + "loss": 1.6336, + "num_tokens": 37232636.0, + "reward": 1.515625, + "reward_std": 0.8239849805831909, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4462348371744156, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.39506568014621735, + "step": 1217, + "token_counts/after_target": 867.75, + 
"token_counts/after_think": 19.25, + "token_counts/before_target": 1890.0, + "token_counts/before_think": 1165.0 + }, + { + "avg_penalty/after_target": 2.8692277371883392, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.27017221227288246, + "avg_penalty/before_think": 0.33048221468925476, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.75, + "completions/max_terminated_length": 351.75, + "completions/mean_length": 133.921875, + "completions/mean_terminated_length": 133.921875, + "completions/min_length": 26.5, + "completions/min_terminated_length": 26.5, + "epoch": 0.609, + "grad_norm": 7.941879749298096, + "kl": 17.265625, + "learning_rate": 7.972127046434878e-06, + "loss": 1.2134, + "num_tokens": 37251079.0, + "reward": 1.40234375, + "reward_std": 0.8809695243835449, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.47669370472431183, + "rewards/tag_count_reward/mean": 0.73046875, + "rewards/tag_count_reward/std": 0.4170132204890251, + "step": 1218, + "token_counts/after_target": 271.5, + "token_counts/after_think": 0.25, + "token_counts/before_target": 1066.5, + "token_counts/before_think": 804.5 + }, + { + "avg_penalty/after_target": 3.021578997373581, + "avg_penalty/after_think": 2.69623726606369, + "avg_penalty/before_target": 0.2797977887094021, + "avg_penalty/before_think": 0.4263085424900055, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.75, + "completions/max_terminated_length": 413.75, + "completions/mean_length": 199.984375, + "completions/mean_terminated_length": 199.984375, + 
"completions/min_length": 38.75, + "completions/min_terminated_length": 38.75, + "epoch": 0.6095, + "grad_norm": 2.812788486480713, + "kl": 15.3125, + "learning_rate": 7.955039481582098e-06, + "loss": 1.2613, + "num_tokens": 37273254.0, + "reward": 1.37109375, + "reward_std": 0.9017152488231659, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.48456869274377823, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.429431788623333, + "step": 1219, + "token_counts/after_target": 520.0, + "token_counts/after_think": 62.5, + "token_counts/before_target": 1617.0, + "token_counts/before_think": 1000.25 + }, + { + "avg_penalty/after_target": 2.917058050632477, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3097006715834141, + "avg_penalty/before_think": 0.4973207674920559, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.25, + "completions/max_terminated_length": 541.25, + "completions/mean_length": 213.0625, + "completions/mean_terminated_length": 213.0625, + "completions/min_length": 51.75, + "completions/min_terminated_length": 51.75, + "epoch": 0.61, + "grad_norm": 4.194769382476807, + "kl": 23.296875, + "learning_rate": 7.937958146033706e-06, + "loss": 1.8738, + "num_tokens": 37296522.0, + "reward": 1.3046875, + "reward_std": 0.9278936088085175, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.49776528775691986, + "rewards/tag_count_reward/mean": 0.6796875, + "rewards/tag_count_reward/std": 0.4473518282175064, + "step": 1220, + "token_counts/after_target": 701.0, + "token_counts/after_think": 21.0, + "token_counts/before_target": 2100.0, + 
"token_counts/before_think": 587.0 + }, + { + "avg_penalty/after_target": 2.6098335683345795, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4071120545268059, + "avg_penalty/before_think": 0.5150745734572411, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.75, + "completions/max_terminated_length": 657.75, + "completions/mean_length": 236.28125, + "completions/mean_terminated_length": 236.28125, + "completions/min_length": 69.5, + "completions/min_terminated_length": 69.5, + "epoch": 0.6105, + "grad_norm": 5.576517105102539, + "kl": 17.953125, + "learning_rate": 7.92088309182241e-06, + "loss": 1.7073, + "num_tokens": 37320620.0, + "reward": 1.54296875, + "reward_std": 0.8471832573413849, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.43303824216127396, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.4153134897351265, + "step": 1221, + "token_counts/after_target": 797.5, + "token_counts/after_think": 34.25, + "token_counts/before_target": 2027.5, + "token_counts/before_think": 921.25 + }, + { + "avg_penalty/after_target": 1.9369195401668549, + "avg_penalty/after_think": 2.924275040626526, + "avg_penalty/before_target": 0.28653038665652275, + "avg_penalty/before_think": 0.40152087062597275, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.25, + "completions/max_terminated_length": 525.25, + "completions/mean_length": 212.328125, + "completions/mean_terminated_length": 212.328125, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 
0.611, + "grad_norm": 3.8650364875793457, + "kl": 16.875, + "learning_rate": 7.903814370961785e-06, + "loss": 1.44, + "num_tokens": 37345745.0, + "reward": 1.515625, + "reward_std": 0.8622251898050308, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4414467439055443, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.4121345356106758, + "step": 1222, + "token_counts/after_target": 367.75, + "token_counts/after_think": 93.0, + "token_counts/before_target": 2014.5, + "token_counts/before_think": 922.0 + }, + { + "avg_penalty/after_target": 1.8291032314300537, + "avg_penalty/after_think": 3.927988827228546, + "avg_penalty/before_target": 0.37895020842552185, + "avg_penalty/before_think": 0.4525475576519966, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.25, + "completions/max_terminated_length": 531.25, + "completions/mean_length": 209.578125, + "completions/mean_terminated_length": 209.578125, + "completions/min_length": 63.5, + "completions/min_terminated_length": 63.5, + "epoch": 0.6115, + "grad_norm": 7.199634075164795, + "kl": 12.34375, + "learning_rate": 7.886752035446116e-06, + "loss": 1.301, + "num_tokens": 37370662.0, + "reward": 1.75390625, + "reward_std": 0.7488611936569214, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.11180340498685837, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.34860680997371674, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.3425466865301132, + "step": 1223, + "token_counts/after_target": 494.5, + "token_counts/after_think": 52.5, + "token_counts/before_target": 1919.5, + "token_counts/before_think": 886.75 + }, + { + 
"avg_penalty/after_target": 2.2520234286785126, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.40447624400258064, + "avg_penalty/before_think": 0.4667111337184906, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.5, + "completions/max_terminated_length": 701.5, + "completions/mean_length": 237.984375, + "completions/mean_terminated_length": 237.984375, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.612, + "grad_norm": 2.797238826751709, + "kl": 17.4765625, + "learning_rate": 7.869696137250235e-06, + "loss": 1.5622, + "num_tokens": 37395621.0, + "reward": 1.51171875, + "reward_std": 0.8137035816907883, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.42078252136707306, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.3987961709499359, + "step": 1224, + "token_counts/after_target": 635.5, + "token_counts/after_think": 156.25, + "token_counts/before_target": 2059.25, + "token_counts/before_think": 956.75 + }, + { + "avg_penalty/after_target": 1.931976556777954, + "avg_penalty/after_think": 3.939355194568634, + "avg_penalty/before_target": 0.4218490719795227, + "avg_penalty/before_think": 0.5157262608408928, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.25, + "completions/max_terminated_length": 638.25, + "completions/mean_length": 232.3125, + "completions/mean_terminated_length": 232.3125, + "completions/min_length": 54.75, + "completions/min_terminated_length": 54.75, + "epoch": 0.6125, + "grad_norm": 3.9698855876922607, + "kl": 
20.328125, + "learning_rate": 7.852646728329368e-06, + "loss": 1.7744, + "num_tokens": 37419353.0, + "reward": 1.49609375, + "reward_std": 0.8603588938713074, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4308478757739067, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.43011847138404846, + "step": 1225, + "token_counts/after_target": 687.25, + "token_counts/after_think": 46.75, + "token_counts/before_target": 2071.0, + "token_counts/before_think": 912.0 + }, + { + "avg_penalty/after_target": 2.4343217313289642, + "avg_penalty/after_think": 2.847874939441681, + "avg_penalty/before_target": 0.43850477784872055, + "avg_penalty/before_think": 0.4422520101070404, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.5, + "completions/max_terminated_length": 558.5, + "completions/mean_length": 208.8125, + "completions/mean_terminated_length": 208.8125, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.613, + "grad_norm": 9.762346267700195, + "kl": 15.6875, + "learning_rate": 7.835603860618973e-06, + "loss": 1.679, + "num_tokens": 37440637.0, + "reward": 1.65234375, + "reward_std": 0.7202581763267517, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.37937305867671967, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.34575045853853226, + "step": 1226, + "token_counts/after_target": 733.75, + "token_counts/after_think": 36.75, + "token_counts/before_target": 1621.75, + "token_counts/before_think": 948.75 + }, + { + "avg_penalty/after_target": 2.7558836340904236, + "avg_penalty/after_think": 1.0, + 
"avg_penalty/before_target": 0.41253549605607986, + "avg_penalty/before_think": 0.602677047252655, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 738.0, + "completions/max_terminated_length": 699.25, + "completions/mean_length": 276.421875, + "completions/mean_terminated_length": 265.7229232788086, + "completions/min_length": 34.25, + "completions/min_terminated_length": 34.25, + "epoch": 0.6135, + "grad_norm": 2.7240188121795654, + "kl": 23.390625, + "learning_rate": 7.818567586034578e-06, + "loss": 1.9604, + "num_tokens": 37468936.0, + "reward": 1.38671875, + "reward_std": 0.9335250407457352, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.47360680997371674, + "rewards/tag_count_reward/mean": 0.69921875, + "rewards/tag_count_reward/std": 0.4620719999074936, + "step": 1227, + "token_counts/after_target": 1106.75, + "token_counts/after_think": 28.0, + "token_counts/before_target": 2296.0, + "token_counts/before_think": 992.0 + }, + { + "avg_penalty/after_target": 2.0371318459510803, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4256369322538376, + "avg_penalty/before_think": 0.3185951039195061, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.75, + "completions/max_terminated_length": 493.75, + "completions/mean_length": 170.703125, + "completions/mean_terminated_length": 170.703125, + "completions/min_length": 39.75, + "completions/min_terminated_length": 39.75, + "epoch": 0.614, + "grad_norm": 2.5268683433532715, + "kl": 15.3125, + "learning_rate": 7.801537956471624e-06, + "loss": 1.307, + 
"num_tokens": 37488005.0, + "reward": 1.60546875, + "reward_std": 0.800348773598671, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.39267293363809586, + "step": 1228, + "token_counts/after_target": 315.5, + "token_counts/after_think": 38.0, + "token_counts/before_target": 1664.0, + "token_counts/before_think": 713.75 + }, + { + "avg_penalty/after_target": 2.7336656153202057, + "avg_penalty/after_think": 1.6194920539855957, + "avg_penalty/before_target": 0.4344271346926689, + "avg_penalty/before_think": 0.431364543735981, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 818.0, + "completions/max_terminated_length": 605.25, + "completions/mean_length": 273.453125, + "completions/mean_terminated_length": 237.13408279418945, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.6145, + "grad_norm": 3.312337875366211, + "kl": 27.28125, + "learning_rate": 7.784515023805328e-06, + "loss": 2.3904, + "num_tokens": 37513586.0, + "reward": 1.5078125, + "reward_std": 0.8148227035999298, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.43616948276758194, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.38955435901880264, + "step": 1229, + "token_counts/after_target": 1084.5, + "token_counts/after_think": 21.5, + "token_counts/before_target": 2481.5, + "token_counts/before_think": 787.75 + }, + { + "avg_penalty/after_target": 1.9812142550945282, + "avg_penalty/after_think": 3.9600754976272583, + "avg_penalty/before_target": 
0.49947022646665573, + "avg_penalty/before_think": 0.5404147133231163, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 769.75, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 223.84375, + "completions/mean_terminated_length": 211.21354293823242, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.615, + "grad_norm": 5.744661808013916, + "kl": 22.75, + "learning_rate": 7.767498839890489e-06, + "loss": 1.8702, + "num_tokens": 37538120.0, + "reward": 1.6484375, + "reward_std": 0.7598292678594589, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.34529730677604675, + "step": 1230, + "token_counts/after_target": 708.0, + "token_counts/after_think": 72.5, + "token_counts/before_target": 2080.25, + "token_counts/before_think": 720.75 + }, + { + "avg_penalty/after_target": 2.590252935886383, + "avg_penalty/after_think": 2.7135100960731506, + "avg_penalty/before_target": 0.3391426168382168, + "avg_penalty/before_think": 0.3130217418074608, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 649.25, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 210.46875, + "completions/mean_terminated_length": 197.23437881469727, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.6155, + "grad_norm": 5.7655768394470215, + "kl": 20.28125, + "learning_rate": 7.750489456561351e-06, + "loss": 1.8915, + "num_tokens": 
37561942.0, + "reward": 1.64453125, + "reward_std": 0.6961973756551743, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.36136941611766815, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.3375837951898575, + "step": 1231, + "token_counts/after_target": 867.75, + "token_counts/after_think": 20.75, + "token_counts/before_target": 1715.75, + "token_counts/before_think": 763.25 + }, + { + "avg_penalty/after_target": 2.757326900959015, + "avg_penalty/after_think": 2.7845945358276367, + "avg_penalty/before_target": 0.4033498801290989, + "avg_penalty/before_think": 0.5345017313957214, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 634.5, + "completions/max_terminated_length": 539.25, + "completions/mean_length": 188.34375, + "completions/mean_terminated_length": 175.45104598999023, + "completions/min_length": 35.25, + "completions/min_terminated_length": 35.25, + "epoch": 0.616, + "grad_norm": 6.707781791687012, + "kl": 24.5859375, + "learning_rate": 7.733486925631448e-06, + "loss": 1.9978, + "num_tokens": 37583084.0, + "reward": 1.51171875, + "reward_std": 0.8387131541967392, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4308478757739067, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.40927109122276306, + "step": 1232, + "token_counts/after_target": 702.0, + "token_counts/after_think": 56.5, + "token_counts/before_target": 1640.75, + "token_counts/before_think": 614.25 + }, + { + "avg_penalty/after_target": 2.0856142044067383, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.34854958206415176, + 
"avg_penalty/before_think": 0.38761887699365616, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 634.5, + "completions/max_terminated_length": 463.75, + "completions/mean_length": 171.078125, + "completions/mean_terminated_length": 157.02083587646484, + "completions/min_length": 25.25, + "completions/min_terminated_length": 25.25, + "epoch": 0.6165, + "grad_norm": 12.728774070739746, + "kl": 27.265625, + "learning_rate": 7.716491298893443e-06, + "loss": 1.9331, + "num_tokens": 37604209.0, + "reward": 1.51953125, + "reward_std": 0.8405971974134445, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4308478757739067, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.41866695135831833, + "step": 1233, + "token_counts/after_target": 322.0, + "token_counts/after_think": 39.0, + "token_counts/before_target": 1699.25, + "token_counts/before_think": 677.0 + }, + { + "avg_penalty/after_target": 2.6909310817718506, + "avg_penalty/after_think": 3.7217600345611572, + "avg_penalty/before_target": 0.3493993952870369, + "avg_penalty/before_think": 0.37578408420085907, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 617.5, + "completions/max_terminated_length": 506.25, + "completions/mean_length": 189.40625, + "completions/mean_terminated_length": 177.21771240234375, + "completions/min_length": 50.75, + "completions/min_terminated_length": 50.75, + "epoch": 0.617, + "grad_norm": 13.997139930725098, + "kl": 31.0, + "learning_rate": 7.699502628118958e-06, + "loss": 2.2072, + "num_tokens": 37629755.0, + "reward": 
1.3984375, + "reward_std": 0.8985808938741684, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4613594636321068, + "rewards/tag_count_reward/mean": 0.7109375, + "rewards/tag_count_reward/std": 0.4415757656097412, + "step": 1234, + "token_counts/after_target": 553.25, + "token_counts/after_think": 19.25, + "token_counts/before_target": 2023.75, + "token_counts/before_think": 434.25 + }, + { + "avg_penalty/after_target": 2.117032289505005, + "avg_penalty/after_think": 2.6438037753105164, + "avg_penalty/before_target": 0.4173549599945545, + "avg_penalty/before_think": 0.37351270765066147, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 641.75, + "completions/max_terminated_length": 580.25, + "completions/mean_length": 193.828125, + "completions/mean_terminated_length": 183.1281280517578, + "completions/min_length": 42.75, + "completions/min_terminated_length": 42.75, + "epoch": 0.6175, + "grad_norm": 3.707928419113159, + "kl": 22.0390625, + "learning_rate": 7.68252096505843e-06, + "loss": 1.8936, + "num_tokens": 37652720.0, + "reward": 1.578125, + "reward_std": 0.7950502783060074, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4075859263539314, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.39656074345111847, + "step": 1235, + "token_counts/after_target": 775.75, + "token_counts/after_think": 28.5, + "token_counts/before_target": 1754.0, + "token_counts/before_think": 543.0 + }, + { + "avg_penalty/after_target": 2.9232256412506104, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4775807820260525, + "avg_penalty/before_think": 
0.3847716748714447, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/max_terminated_length": 601.0, + "completions/mean_length": 169.359375, + "completions/mean_terminated_length": 169.359375, + "completions/min_length": 42.25, + "completions/min_terminated_length": 42.25, + "epoch": 0.618, + "grad_norm": 8.720147132873535, + "kl": 15.3349609375, + "learning_rate": 7.66554636144095e-06, + "loss": 1.7391, + "num_tokens": 37672599.0, + "reward": 1.78125, + "reward_std": 0.5200984627008438, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.2640564441680908, + "rewards/tag_count_reward/mean": 0.890625, + "rewards/tag_count_reward/std": 0.257498100399971, + "step": 1236, + "token_counts/after_target": 672.0, + "token_counts/after_think": 39.5, + "token_counts/before_target": 1067.75, + "token_counts/before_think": 930.5 + }, + { + "avg_penalty/after_target": 2.8289636075496674, + "avg_penalty/after_think": 3.6847257018089294, + "avg_penalty/before_target": 0.38610734790563583, + "avg_penalty/before_think": 0.5288612470030785, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 178.296875, + "completions/mean_terminated_length": 178.296875, + "completions/min_length": 52.75, + "completions/min_terminated_length": 52.75, + "epoch": 0.6185, + "grad_norm": 4.469701290130615, + "kl": 15.546875, + "learning_rate": 7.6485788689741e-06, + "loss": 1.3369, + "num_tokens": 37696122.0, + "reward": 1.5859375, + "reward_std": 0.7819046378135681, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3717194125056267, + "step": 1237, + "token_counts/after_target": 462.25, + "token_counts/after_think": 97.5, + "token_counts/before_target": 1524.25, + "token_counts/before_think": 768.75 + }, + { + "avg_penalty/after_target": 2.5310892462730408, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3784763626754284, + "avg_penalty/before_think": 0.4416182115674019, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.25, + "completions/max_terminated_length": 520.25, + "completions/mean_length": 195.875, + "completions/mean_terminated_length": 195.875, + "completions/min_length": 49.75, + "completions/min_terminated_length": 49.75, + "epoch": 0.619, + "grad_norm": 2.376042366027832, + "kl": 12.4609375, + "learning_rate": 7.631618539343815e-06, + "loss": 1.1448, + "num_tokens": 37718370.0, + "reward": 1.66015625, + "reward_std": 0.7675025314092636, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38772592693567276, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.38026267290115356, + "step": 1238, + "token_counts/after_target": 574.0, + "token_counts/after_think": 42.0, + "token_counts/before_target": 1786.5, + "token_counts/before_think": 731.5 + }, + { + "avg_penalty/after_target": 2.3783918619155884, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.3498455882072449, + "avg_penalty/before_think": 0.5472446754574776, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.25, + "completions/max_terminated_length": 551.25, + "completions/mean_length": 245.765625, + "completions/mean_terminated_length": 245.765625, + "completions/min_length": 38.75, + "completions/min_terminated_length": 38.75, + "epoch": 0.6195, + "grad_norm": 4.579905033111572, + "kl": 12.23681640625, + "learning_rate": 7.6146654242141935e-06, + "loss": 1.0741, + "num_tokens": 37750563.0, + "reward": 1.63671875, + "reward_std": 0.5454893559217453, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.2759781554341316, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.2738640382885933, + "step": 1239, + "token_counts/after_target": 748.5, + "token_counts/after_think": 12.5, + "token_counts/before_target": 2412.0, + "token_counts/before_think": 759.25 + }, + { + "avg_penalty/after_target": 3.052797704935074, + "avg_penalty/after_think": 3.8935548663139343, + "avg_penalty/before_target": 0.4261104352772236, + "avg_penalty/before_think": 0.5315608829259872, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.75, + "completions/max_terminated_length": 496.75, + "completions/mean_length": 155.46875, + "completions/mean_terminated_length": 155.46875, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.62, + "grad_norm": 5.020211219787598, + "kl": 15.4375, + "learning_rate": 7.597719575227364e-06, + "loss": 1.6105, + "num_tokens": 37768929.0, + "reward": 1.74609375, + "reward_std": 0.6926862448453903, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 
0.08539126068353653, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36797718703746796, + "rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.2982109263539314, + "step": 1240, + "token_counts/after_target": 485.75, + "token_counts/after_think": 63.0, + "token_counts/before_target": 1211.5, + "token_counts/before_think": 727.25 + }, + { + "avg_penalty/after_target": 2.5050180852413177, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3418091796338558, + "avg_penalty/before_think": 0.48298969864845276, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.25, + "completions/max_terminated_length": 446.25, + "completions/mean_length": 164.796875, + "completions/mean_terminated_length": 164.796875, + "completions/min_length": 59.75, + "completions/min_terminated_length": 59.75, + "epoch": 0.6205, + "grad_norm": 3.5109176635742188, + "kl": 15.953125, + "learning_rate": 7.580781044003324e-06, + "loss": 1.4812, + "num_tokens": 37792052.0, + "reward": 1.703125, + "reward_std": 0.6892103254795074, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.3604728877544403, + "rewards/tag_count_reward/mean": 0.859375, + "rewards/tag_count_reward/std": 0.3330090567469597, + "step": 1241, + "token_counts/after_target": 416.0, + "token_counts/after_think": 47.5, + "token_counts/before_target": 1545.25, + "token_counts/before_think": 628.0 + }, + { + "avg_penalty/after_target": 3.0599194169044495, + "avg_penalty/after_think": 2.648476541042328, + "avg_penalty/before_target": 0.28085074201226234, + "avg_penalty/before_think": 0.40442850440740585, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.25, + "completions/max_terminated_length": 606.25, + "completions/mean_length": 199.0, + "completions/mean_terminated_length": 199.0, + "completions/min_length": 22.25, + "completions/min_terminated_length": 22.25, + "epoch": 0.621, + "grad_norm": 4.163013935089111, + "kl": 15.76171875, + "learning_rate": 7.5638498821397755e-06, + "loss": 1.298, + "num_tokens": 37814276.0, + "reward": 1.58203125, + "reward_std": 0.7811354249715805, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4101393073797226, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.37762612849473953, + "step": 1242, + "token_counts/after_target": 397.25, + "token_counts/after_think": 70.0, + "token_counts/before_target": 1780.5, + "token_counts/before_think": 936.25 + }, + { + "avg_penalty/after_target": 2.3768274784088135, + "avg_penalty/after_think": 2.9870590567588806, + "avg_penalty/before_target": 0.30992911756038666, + "avg_penalty/before_think": 0.4414778910577297, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.5, + "completions/max_terminated_length": 461.5, + "completions/mean_length": 157.640625, + "completions/mean_terminated_length": 157.640625, + "completions/min_length": 30.5, + "completions/min_terminated_length": 30.5, + "epoch": 0.6215, + "grad_norm": 4.10118293762207, + "kl": 11.3671875, + "learning_rate": 7.546926141211975e-06, + "loss": 1.1036, + "num_tokens": 37835373.0, + "reward": 1.6328125, + "reward_std": 0.7392726391553879, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + 
"rewards/format_reward/std": 0.39476002007722855, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.3623364716768265, + "step": 1243, + "token_counts/after_target": 442.75, + "token_counts/after_think": 97.25, + "token_counts/before_target": 1115.0, + "token_counts/before_think": 867.25 + }, + { + "avg_penalty/after_target": 2.6964060366153717, + "avg_penalty/after_think": 3.723109006881714, + "avg_penalty/before_target": 0.26874153688549995, + "avg_penalty/before_think": 0.5678831860423088, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 184.15625, + "completions/mean_terminated_length": 184.15625, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.622, + "grad_norm": 4.375447750091553, + "kl": 9.767578125, + "learning_rate": 7.530009872772572e-06, + "loss": 0.9558, + "num_tokens": 37857047.0, + "reward": 1.68359375, + "reward_std": 0.7037475854158401, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3758598491549492, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.33581091463565826, + "step": 1244, + "token_counts/after_target": 366.25, + "token_counts/after_think": 135.25, + "token_counts/before_target": 1818.25, + "token_counts/before_think": 626.75 + }, + { + "avg_penalty/after_target": 2.2549671828746796, + "avg_penalty/after_think": 2.9679174423217773, + "avg_penalty/before_target": 0.28571294993162155, + "avg_penalty/before_think": 0.4204663932323456, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 378.5, + "completions/max_terminated_length": 378.5, + "completions/mean_length": 146.859375, + "completions/mean_terminated_length": 146.859375, + "completions/min_length": 39.5, + "completions/min_terminated_length": 39.5, + "epoch": 0.6225, + "grad_norm": 9.101594924926758, + "kl": 20.4375, + "learning_rate": 7.513101128351454e-06, + "loss": 1.3993, + "num_tokens": 37874670.0, + "reward": 1.421875, + "reward_std": 0.9195376485586166, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4682852029800415, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.45617998391389847, + "step": 1245, + "token_counts/after_target": 259.25, + "token_counts/after_think": 43.0, + "token_counts/before_target": 1389.5, + "token_counts/before_think": 658.0 + }, + { + "avg_penalty/after_target": 2.506894290447235, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4089883305132389, + "avg_penalty/before_think": 0.4956127405166626, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.25, + "completions/max_terminated_length": 373.25, + "completions/mean_length": 158.1875, + "completions/mean_terminated_length": 158.1875, + "completions/min_length": 40.75, + "completions/min_terminated_length": 40.75, + "epoch": 0.623, + "grad_norm": 4.510664463043213, + "kl": 14.703125, + "learning_rate": 7.496199959455584e-06, + "loss": 1.3721, + "num_tokens": 37893290.0, + "reward": 1.64453125, + "reward_std": 0.7642461508512497, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 
0.83203125, + "rewards/tag_count_reward/std": 0.3693338632583618, + "step": 1246, + "token_counts/after_target": 529.25, + "token_counts/after_think": 32.75, + "token_counts/before_target": 1383.5, + "token_counts/before_think": 585.5 + }, + { + "avg_penalty/after_target": 2.6640065908432007, + "avg_penalty/after_think": 3.7688368558883667, + "avg_penalty/before_target": 0.2942775599658489, + "avg_penalty/before_think": 0.4672319293022156, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 168.046875, + "completions/mean_terminated_length": 168.046875, + "completions/min_length": 47.25, + "completions/min_terminated_length": 47.25, + "epoch": 0.6235, + "grad_norm": 2.926957607269287, + "kl": 12.84375, + "learning_rate": 7.4793064175688635e-06, + "loss": 1.1569, + "num_tokens": 37915613.0, + "reward": 1.62890625, + "reward_std": 0.7895528823137283, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.3912878558039665, + "step": 1247, + "token_counts/after_target": 267.5, + "token_counts/after_think": 70.5, + "token_counts/before_target": 1567.5, + "token_counts/before_think": 783.25 + }, + { + "avg_penalty/after_target": 2.6544929146766663, + "avg_penalty/after_think": 2.7081162333488464, + "avg_penalty/before_target": 0.4044003374874592, + "avg_penalty/before_think": 0.5997156649827957, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 570.25, + 
"completions/max_terminated_length": 457.25, + "completions/mean_length": 207.34375, + "completions/mean_terminated_length": 194.5093765258789, + "completions/min_length": 42.25, + "completions/min_terminated_length": 42.25, + "epoch": 0.624, + "grad_norm": 2.921787977218628, + "kl": 18.015625, + "learning_rate": 7.462420554151945e-06, + "loss": 1.5735, + "num_tokens": 37939859.0, + "reward": 1.4609375, + "reward_std": 0.8627976924180984, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4682852029800415, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.4137861877679825, + "step": 1248, + "token_counts/after_target": 755.5, + "token_counts/after_think": 24.5, + "token_counts/before_target": 1609.25, + "token_counts/before_think": 928.25 + }, + { + "avg_penalty/after_target": 1.5892398655414581, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3569462075829506, + "avg_penalty/before_think": 0.5373412147164345, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.25, + "completions/max_terminated_length": 578.25, + "completions/mean_length": 243.578125, + "completions/mean_terminated_length": 243.578125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.6245, + "grad_norm": 8.99039077758789, + "kl": 16.59375, + "learning_rate": 7.445542420642097e-06, + "loss": 1.144, + "num_tokens": 37965144.0, + "reward": 1.47265625, + "reward_std": 0.832858681678772, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4682852029800415, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 
0.4162333905696869, + "step": 1249, + "token_counts/after_target": 462.0, + "token_counts/after_think": 30.25, + "token_counts/before_target": 1982.25, + "token_counts/before_think": 1422.75 + }, + { + "avg_penalty/after_target": 2.8291942477226257, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3226822130382061, + "avg_penalty/before_think": 0.5338527411222458, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.25, + "completions/max_terminated_length": 568.25, + "completions/mean_length": 198.640625, + "completions/mean_terminated_length": 198.640625, + "completions/min_length": 64.5, + "completions/min_terminated_length": 64.5, + "epoch": 0.625, + "grad_norm": 4.600222110748291, + "kl": 14.9765625, + "learning_rate": 7.428672068453041e-06, + "loss": 1.4084, + "num_tokens": 37987777.0, + "reward": 1.59375, + "reward_std": 0.7037722617387772, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.38879410922527313, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.3362013250589371, + "step": 1250, + "token_counts/after_target": 580.75, + "token_counts/after_think": 50.25, + "token_counts/before_target": 1688.5, + "token_counts/before_think": 858.75 + }, + { + "avg_penalty/after_target": 2.152832955121994, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4311966225504875, + "avg_penalty/before_think": 0.600401371717453, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.25, + "completions/max_terminated_length": 520.25, + "completions/mean_length": 208.140625, + 
"completions/mean_terminated_length": 208.140625, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.6255, + "grad_norm": 6.360876083374023, + "kl": 15.671875, + "learning_rate": 7.411809548974792e-06, + "loss": 1.4738, + "num_tokens": 38016522.0, + "reward": 1.67578125, + "reward_std": 0.7031493782997131, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.37149807065725327, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.34761860221624374, + "step": 1251, + "token_counts/after_target": 606.25, + "token_counts/after_think": 140.0, + "token_counts/before_target": 1766.25, + "token_counts/before_think": 817.75 + }, + { + "avg_penalty/after_target": 2.603970855474472, + "avg_penalty/after_think": 2.362124741077423, + "avg_penalty/before_target": 0.3611665964126587, + "avg_penalty/before_think": 0.5390426814556122, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.25, + "completions/max_terminated_length": 552.25, + "completions/mean_length": 225.984375, + "completions/mean_terminated_length": 225.984375, + "completions/min_length": 55.5, + "completions/min_terminated_length": 55.5, + "epoch": 0.626, + "grad_norm": 6.549646377563477, + "kl": 14.453125, + "learning_rate": 7.394954913573517e-06, + "loss": 1.4385, + "num_tokens": 38039225.0, + "reward": 1.546875, + "reward_std": 0.7845748364925385, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4383598491549492, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.34862352162599564, + "step": 1252, + "token_counts/after_target": 663.0, + 
"token_counts/after_think": 56.0, + "token_counts/before_target": 1615.75, + "token_counts/before_think": 1281.0 + }, + { + "avg_penalty/after_target": 2.401067227125168, + "avg_penalty/after_think": 2.9108853340148926, + "avg_penalty/before_target": 0.4446977600455284, + "avg_penalty/before_think": 0.536383867263794, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 658.0, + "completions/max_terminated_length": 510.5, + "completions/mean_length": 239.109375, + "completions/mean_terminated_length": 226.45833587646484, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.6265, + "grad_norm": 5.392938137054443, + "kl": 21.328125, + "learning_rate": 7.378108213591355e-06, + "loss": 1.6789, + "num_tokens": 38067728.0, + "reward": 1.30859375, + "reward_std": 0.8750933557748795, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.5061737895011902, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.4156179055571556, + "step": 1253, + "token_counts/after_target": 817.0, + "token_counts/after_think": 54.75, + "token_counts/before_target": 1966.0, + "token_counts/before_think": 988.0 + }, + { + "avg_penalty/after_target": 2.9607914984226227, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.30029579624533653, + "avg_penalty/before_think": 0.3665614426136017, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.25, + "completions/max_terminated_length": 483.25, + "completions/mean_length": 183.25, + "completions/mean_terminated_length": 183.25, + 
"completions/min_length": 35.5, + "completions/min_terminated_length": 35.5, + "epoch": 0.627, + "grad_norm": 5.1633758544921875, + "kl": 21.0, + "learning_rate": 7.361269500346274e-06, + "loss": 1.8625, + "num_tokens": 38089008.0, + "reward": 1.43359375, + "reward_std": 0.8395128697156906, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4713720977306366, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.4019065350294113, + "step": 1254, + "token_counts/after_target": 632.25, + "token_counts/after_think": 31.25, + "token_counts/before_target": 1401.75, + "token_counts/before_think": 866.75 + }, + { + "avg_penalty/after_target": 2.0846745669841766, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.45742492377758026, + "avg_penalty/before_think": 0.3380098417401314, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.75, + "completions/max_terminated_length": 558.75, + "completions/mean_length": 227.609375, + "completions/mean_terminated_length": 227.609375, + "completions/min_length": 52.5, + "completions/min_terminated_length": 52.5, + "epoch": 0.6275, + "grad_norm": 10.237743377685547, + "kl": 27.90625, + "learning_rate": 7.344438825131912e-06, + "loss": 2.0871, + "num_tokens": 38111831.0, + "reward": 1.328125, + "reward_std": 0.9260212779045105, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.4955305755138397, + "rewards/tag_count_reward/mean": 0.6875, + "rewards/tag_count_reward/std": 0.43171579390764236, + "step": 1255, + "token_counts/after_target": 810.0, + "token_counts/after_think": 158.25, + "token_counts/before_target": 1732.75, + 
"token_counts/before_think": 940.75 + }, + { + "avg_penalty/after_target": 1.7446274757385254, + "avg_penalty/after_think": 3.762124240398407, + "avg_penalty/before_target": 0.43581684678792953, + "avg_penalty/before_think": 0.5965782850980759, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 734.5, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 308.09375, + "completions/mean_terminated_length": 295.6208381652832, + "completions/min_length": 51.75, + "completions/min_terminated_length": 51.75, + "epoch": 0.628, + "grad_norm": 11.845428466796875, + "kl": 24.84375, + "learning_rate": 7.327616239217432e-06, + "loss": 1.7804, + "num_tokens": 38142061.0, + "reward": 1.359375, + "reward_std": 0.8774421960115433, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4876555874943733, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.40921246632933617, + "step": 1256, + "token_counts/after_target": 1043.25, + "token_counts/after_think": 109.75, + "token_counts/before_target": 2902.25, + "token_counts/before_think": 874.25 + }, + { + "avg_penalty/after_target": 2.116474539041519, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.38005381077528, + "avg_penalty/before_think": 0.779451921582222, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 783.25, + "completions/max_terminated_length": 783.25, + "completions/mean_length": 322.515625, + "completions/mean_terminated_length": 322.515625, + "completions/min_length": 56.25, + "completions/min_terminated_length": 56.25, + 
"epoch": 0.6285, + "grad_norm": 9.420414924621582, + "kl": 23.09375, + "learning_rate": 7.310801793847344e-06, + "loss": 1.7261, + "num_tokens": 38170110.0, + "reward": 1.19140625, + "reward_std": 0.8540397137403488, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.515625, + "rewards/format_reward/std": 0.48079314827919006, + "rewards/tag_count_reward/mean": 0.67578125, + "rewards/tag_count_reward/std": 0.42213737964630127, + "step": 1257, + "token_counts/after_target": 960.75, + "token_counts/after_think": 79.0, + "token_counts/before_target": 2545.75, + "token_counts/before_think": 1574.75 + }, + { + "avg_penalty/after_target": 2.3272188901901245, + "avg_penalty/after_think": 2.9670222997665405, + "avg_penalty/before_target": 0.43760591000318527, + "avg_penalty/before_think": 0.5710088238120079, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 811.75, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 359.71875, + "completions/mean_terminated_length": 349.4406280517578, + "completions/min_length": 87.25, + "completions/min_terminated_length": 87.25, + "epoch": 0.629, + "grad_norm": 10.303540229797363, + "kl": 26.34375, + "learning_rate": 7.2939955402413666e-06, + "loss": 1.9695, + "num_tokens": 38202092.0, + "reward": 1.23828125, + "reward_std": 0.9079888314008713, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.5625, + "rewards/format_reward/std": 0.5102732330560684, + "rewards/tag_count_reward/mean": 0.67578125, + "rewards/tag_count_reward/std": 0.43031851202249527, + "step": 1258, + "token_counts/after_target": 1285.0, + "token_counts/after_think": 50.75, + "token_counts/before_target": 3141.5, + "token_counts/before_think": 1278.25 + }, + { + 
"avg_penalty/after_target": 2.3129091262817383, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.48088952898979187, + "avg_penalty/before_think": 0.5923145785927773, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 781.5, + "completions/max_terminated_length": 739.75, + "completions/mean_length": 325.921875, + "completions/mean_terminated_length": 317.25938415527344, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.6295, + "grad_norm": 9.097434043884277, + "kl": 28.9375, + "learning_rate": 7.277197529594257e-06, + "loss": 2.1655, + "num_tokens": 38236247.0, + "reward": 1.125, + "reward_std": 0.9123812615871429, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.515625, + "rewards/format_reward/std": 0.48989029973745346, + "rewards/tag_count_reward/mean": 0.609375, + "rewards/tag_count_reward/std": 0.45303045213222504, + "step": 1259, + "token_counts/after_target": 1649.75, + "token_counts/after_think": 34.0, + "token_counts/before_target": 2888.25, + "token_counts/before_think": 642.75 + }, + { + "avg_penalty/after_target": 2.8090383410453796, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.46633434295654297, + "avg_penalty/before_think": 0.6117408536374569, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.75, + "completions/max_terminated_length": 720.75, + "completions/mean_length": 308.34375, + "completions/mean_terminated_length": 308.34375, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.63, + "grad_norm": 7.188309192657471, + "kl": 
18.234375, + "learning_rate": 7.260407813075676e-06, + "loss": 1.7648, + "num_tokens": 38268605.0, + "reward": 1.51953125, + "reward_std": 0.8260273635387421, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.45508860796689987, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.39426497370004654, + "step": 1260, + "token_counts/after_target": 1352.0, + "token_counts/after_think": 132.25, + "token_counts/before_target": 2314.0, + "token_counts/before_think": 1135.25 + }, + { + "avg_penalty/after_target": 2.6491923332214355, + "avg_penalty/after_think": 2.375649720430374, + "avg_penalty/before_target": 0.4434083253145218, + "avg_penalty/before_think": 0.5158529169857502, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.25, + "completions/max_terminated_length": 601.25, + "completions/mean_length": 299.5, + "completions/mean_terminated_length": 299.5, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.6305, + "grad_norm": 3.3445639610290527, + "kl": 20.515625, + "learning_rate": 7.243626441830009e-06, + "loss": 1.7243, + "num_tokens": 38298637.0, + "reward": 1.35546875, + "reward_std": 0.9000040739774704, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.48025963455438614, + "rewards/tag_count_reward/mean": 0.69921875, + "rewards/tag_count_reward/std": 0.4370359554886818, + "step": 1261, + "token_counts/after_target": 1214.5, + "token_counts/after_think": 24.0, + "token_counts/before_target": 2821.0, + "token_counts/before_think": 732.5 + }, + { + "avg_penalty/after_target": 2.255633443593979, + "avg_penalty/after_think": 
2.858137607574463, + "avg_penalty/before_target": 0.3887055031955242, + "avg_penalty/before_think": 0.4348254054784775, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 179.515625, + "completions/mean_terminated_length": 179.515625, + "completions/min_length": 44.5, + "completions/min_terminated_length": 44.5, + "epoch": 0.631, + "grad_norm": 8.324063301086426, + "kl": 15.734375, + "learning_rate": 7.226853466976222e-06, + "loss": 1.5784, + "num_tokens": 38320670.0, + "reward": 1.59375, + "reward_std": 0.7536016702651978, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.43303824216127396, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.34499504417181015, + "step": 1262, + "token_counts/after_target": 412.75, + "token_counts/after_think": 25.75, + "token_counts/before_target": 1508.75, + "token_counts/before_think": 925.0 + }, + { + "avg_penalty/after_target": 1.765894889831543, + "avg_penalty/after_think": 3.7597039341926575, + "avg_penalty/before_target": 0.3783535696566105, + "avg_penalty/before_think": 0.4865832030773163, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.0, + "completions/max_terminated_length": 619.0, + "completions/mean_length": 255.765625, + "completions/mean_terminated_length": 255.765625, + "completions/min_length": 62.5, + "completions/min_terminated_length": 62.5, + "epoch": 0.6315, + "grad_norm": 4.754895210266113, + "kl": 17.140625, + "learning_rate": 7.210088939607709e-06, + "loss": 
1.5946, + "num_tokens": 38346623.0, + "reward": 1.5, + "reward_std": 0.8311598151922226, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4598134011030197, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.3929530903697014, + "step": 1263, + "token_counts/after_target": 659.0, + "token_counts/after_think": 181.5, + "token_counts/before_target": 2417.75, + "token_counts/before_think": 834.0 + }, + { + "avg_penalty/after_target": 2.6351794600486755, + "avg_penalty/after_think": 3.975628972053528, + "avg_penalty/before_target": 0.33414769917726517, + "avg_penalty/before_think": 0.6454251259565353, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 619.5, + "completions/max_terminated_length": 546.75, + "completions/mean_length": 271.484375, + "completions/mean_terminated_length": 261.4635467529297, + "completions/min_length": 58.5, + "completions/min_terminated_length": 58.5, + "epoch": 0.632, + "grad_norm": 12.608966827392578, + "kl": 12.328125, + "learning_rate": 7.1933329107921244e-06, + "loss": 1.4478, + "num_tokens": 38373214.0, + "reward": 1.63671875, + "reward_std": 0.7457249909639359, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.3548450991511345, + "step": 1264, + "token_counts/after_target": 599.0, + "token_counts/after_think": 353.25, + "token_counts/before_target": 2147.0, + "token_counts/before_think": 1244.5 + }, + { + "avg_penalty/after_target": 2.4679220616817474, + "avg_penalty/after_think": 2.838565468788147, + "avg_penalty/before_target": 
0.43310701847076416, + "avg_penalty/before_think": 0.7689275667071342, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.25, + "completions/max_terminated_length": 676.25, + "completions/mean_length": 289.171875, + "completions/mean_terminated_length": 289.171875, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.6325, + "grad_norm": 11.075374603271484, + "kl": 13.8515625, + "learning_rate": 7.176585431571235e-06, + "loss": 1.5598, + "num_tokens": 38401129.0, + "reward": 1.54296875, + "reward_std": 0.6997622549533844, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.42206869274377823, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.3015919700264931, + "step": 1265, + "token_counts/after_target": 1190.0, + "token_counts/after_think": 197.75, + "token_counts/before_target": 2013.75, + "token_counts/before_think": 1225.25 + }, + { + "avg_penalty/after_target": 2.150837689638138, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.42139770835638046, + "avg_penalty/before_think": 0.53964464366436, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 602.75, + "completions/max_terminated_length": 503.5, + "completions/mean_length": 252.875, + "completions/mean_terminated_length": 241.61771392822266, + "completions/min_length": 36.75, + "completions/min_terminated_length": 36.75, + "epoch": 0.633, + "grad_norm": 5.335844039916992, + "kl": 23.265625, + "learning_rate": 7.159846552960774e-06, + "loss": 1.8311, + "num_tokens": 38426497.0, + "reward": 
1.2578125, + "reward_std": 0.9462847113609314, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4939897432923317, + "rewards/tag_count_reward/mean": 0.6484375, + "rewards/tag_count_reward/std": 0.4714300408959389, + "step": 1266, + "token_counts/after_target": 955.5, + "token_counts/after_think": 32.75, + "token_counts/before_target": 2339.0, + "token_counts/before_think": 718.75 + }, + { + "avg_penalty/after_target": 2.651654362678528, + "avg_penalty/after_think": 1.4319714307785034, + "avg_penalty/before_target": 0.4071025997400284, + "avg_penalty/before_think": 0.3328841030597687, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 672.0, + "completions/max_terminated_length": 654.5, + "completions/mean_length": 276.265625, + "completions/mean_terminated_length": 266.85313415527344, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.6335, + "grad_norm": 6.184483528137207, + "kl": 21.21875, + "learning_rate": 7.143116325950266e-06, + "loss": 1.9481, + "num_tokens": 38454706.0, + "reward": 1.47265625, + "reward_std": 0.852198526263237, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.43217839300632477, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.4071761965751648, + "step": 1267, + "token_counts/after_target": 1121.75, + "token_counts/after_think": 46.0, + "token_counts/before_target": 2269.5, + "token_counts/before_think": 983.0 + }, + { + "avg_penalty/after_target": 2.2034782767295837, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.43550993502140045, + "avg_penalty/before_think": 
0.5494263544678688, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 694.0, + "completions/max_terminated_length": 660.5, + "completions/mean_length": 302.90625, + "completions/mean_terminated_length": 291.8062515258789, + "completions/min_length": 54.75, + "completions/min_terminated_length": 54.75, + "epoch": 0.634, + "grad_norm": 3.906202793121338, + "kl": 14.453125, + "learning_rate": 7.126394801502883e-06, + "loss": 1.3774, + "num_tokens": 38483628.0, + "reward": 1.6171875, + "reward_std": 0.7310252338647842, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.42516325414180756, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.331196166574955, + "step": 1268, + "token_counts/after_target": 923.0, + "token_counts/after_think": 18.0, + "token_counts/before_target": 2287.0, + "token_counts/before_think": 1618.5 + }, + { + "avg_penalty/after_target": 2.9557881355285645, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.35520368069410324, + "avg_penalty/before_think": 0.3989271707832813, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 205.953125, + "completions/mean_terminated_length": 205.953125, + "completions/min_length": 31.25, + "completions/min_terminated_length": 31.25, + "epoch": 0.6345, + "grad_norm": 3.795661211013794, + "kl": 15.5625, + "learning_rate": 7.109682030555283e-06, + "loss": 1.3735, + "num_tokens": 38506985.0, + "reward": 1.53125, + "reward_std": 0.7903959304094315, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4260597825050354, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.3802092894911766, + "step": 1269, + "token_counts/after_target": 438.25, + "token_counts/after_think": 36.0, + "token_counts/before_target": 1892.5, + "token_counts/before_think": 928.5 + }, + { + "avg_penalty/after_target": 1.9829229414463043, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5097835138440132, + "avg_penalty/before_think": 0.585406094789505, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 826.25, + "completions/max_terminated_length": 786.75, + "completions/mean_length": 326.875, + "completions/mean_terminated_length": 315.37291717529297, + "completions/min_length": 72.25, + "completions/min_terminated_length": 72.25, + "epoch": 0.635, + "grad_norm": 3.9703736305236816, + "kl": 18.671875, + "learning_rate": 7.092978064017475e-06, + "loss": 1.6822, + "num_tokens": 38536465.0, + "reward": 1.52734375, + "reward_std": 0.8083615303039551, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44938503205776215, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.37535910308361053, + "step": 1270, + "token_counts/after_target": 1179.5, + "token_counts/after_think": 152.75, + "token_counts/before_target": 2609.0, + "token_counts/before_think": 1288.75 + }, + { + "avg_penalty/after_target": 1.9772995114326477, + "avg_penalty/after_think": 3.791216552257538, + "avg_penalty/before_target": 0.520549476146698, + "avg_penalty/before_think": 0.6308973878622055, + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 763.5, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 298.84375, + "completions/mean_terminated_length": 286.72500228881836, + "completions/min_length": 57.75, + "completions/min_terminated_length": 57.75, + "epoch": 0.6355, + "grad_norm": 11.32104778289795, + "kl": 14.203125, + "learning_rate": 7.076282952772634e-06, + "loss": 1.5657, + "num_tokens": 38566263.0, + "reward": 1.67578125, + "reward_std": 0.711866945028305, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4141380712389946, + "rewards/tag_count_reward/mean": 0.86328125, + "rewards/tag_count_reward/std": 0.30010179430246353, + "step": 1271, + "token_counts/after_target": 1011.5, + "token_counts/after_think": 185.5, + "token_counts/before_target": 2396.25, + "token_counts/before_think": 1188.25 + }, + { + "avg_penalty/after_target": 2.6055487394332886, + "avg_penalty/after_think": 1.6410603523254395, + "avg_penalty/before_target": 0.2341417632997036, + "avg_penalty/before_think": 0.3934405893087387, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.25, + "completions/max_terminated_length": 443.25, + "completions/mean_length": 175.5, + "completions/mean_terminated_length": 175.5, + "completions/min_length": 52.75, + "completions/min_terminated_length": 52.75, + "epoch": 0.636, + "grad_norm": 11.30545425415039, + "kl": 23.34375, + "learning_rate": 7.059596747676963e-06, + "loss": 1.6377, + "num_tokens": 38587623.0, + "reward": 1.3671875, + "reward_std": 0.8957725316286087, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4788651168346405, + "rewards/tag_count_reward/mean": 0.7109375, + "rewards/tag_count_reward/std": 0.43171414732933044, + "step": 1272, + "token_counts/after_target": 344.25, + "token_counts/after_think": 21.75, + "token_counts/before_target": 1805.5, + "token_counts/before_think": 636.5 + }, + { + "avg_penalty/after_target": 2.871044874191284, + "avg_penalty/after_think": 1.551517903804779, + "avg_penalty/before_target": 0.39937999472022057, + "avg_penalty/before_think": 0.42635995894670486, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.25, + "completions/max_terminated_length": 578.25, + "completions/mean_length": 209.484375, + "completions/mean_terminated_length": 209.484375, + "completions/min_length": 39.75, + "completions/min_terminated_length": 39.75, + "epoch": 0.6365, + "grad_norm": 6.566179275512695, + "kl": 23.03125, + "learning_rate": 7.042919499559538e-06, + "loss": 1.7682, + "num_tokens": 38610422.0, + "reward": 1.40234375, + "reward_std": 0.8869081139564514, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.47669370472431183, + "rewards/tag_count_reward/mean": 0.73046875, + "rewards/tag_count_reward/std": 0.4282539039850235, + "step": 1273, + "token_counts/after_target": 742.0, + "token_counts/after_think": 23.5, + "token_counts/before_target": 1945.75, + "token_counts/before_think": 640.5 + }, + { + "avg_penalty/after_target": 2.9179546236991882, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.5858103558421135, + "avg_penalty/before_think": 0.5054986849427223, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 834.0, + "completions/max_terminated_length": 638.75, + "completions/mean_length": 273.953125, + "completions/mean_terminated_length": 250.10833930969238, + "completions/min_length": 34.25, + "completions/min_terminated_length": 34.25, + "epoch": 0.637, + "grad_norm": 2.703667640686035, + "kl": 23.859375, + "learning_rate": 7.026251259222141e-06, + "loss": 2.1453, + "num_tokens": 38641267.0, + "reward": 1.421875, + "reward_std": 0.877068504691124, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.46450965851545334, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.4223322793841362, + "step": 1274, + "token_counts/after_target": 1270.0, + "token_counts/after_think": 97.75, + "token_counts/before_target": 1925.0, + "token_counts/before_think": 1090.5 + }, + { + "avg_penalty/after_target": 1.9658854603767395, + "avg_penalty/after_think": 2.596571445465088, + "avg_penalty/before_target": 0.5890798717737198, + "avg_penalty/before_think": 0.6226803064346313, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 828.5, + "completions/max_terminated_length": 669.5, + "completions/mean_length": 281.875, + "completions/mean_terminated_length": 256.3656311035156, + "completions/min_length": 52.5, + "completions/min_terminated_length": 52.5, + "epoch": 0.6375, + "grad_norm": 9.686531066894531, + "kl": 29.6875, + "learning_rate": 7.009592077439135e-06, + "loss": 2.3473, + "num_tokens": 38668827.0, + "reward": 1.44140625, + "reward_std": 0.8421348035335541, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4581565484404564, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.40620023012161255, + "step": 1275, + "token_counts/after_target": 1118.5, + "token_counts/after_think": 56.25, + "token_counts/before_target": 2647.5, + "token_counts/before_think": 687.75 + }, + { + "avg_penalty/after_target": 2.6343153715133667, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.48068687319755554, + "avg_penalty/before_think": 0.48559898883104324, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 611.25, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 258.9375, + "completions/mean_terminated_length": 247.41250610351562, + "completions/min_length": 63.75, + "completions/min_terminated_length": 63.75, + "epoch": 0.638, + "grad_norm": 5.501208782196045, + "kl": 24.6875, + "learning_rate": 6.992942004957271e-06, + "loss": 2.0192, + "num_tokens": 38695975.0, + "reward": 1.49609375, + "reward_std": 0.8489070236682892, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4713720977306366, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.35817770659923553, + "step": 1276, + "token_counts/after_target": 1082.25, + "token_counts/after_think": 7.5, + "token_counts/before_target": 2309.25, + "token_counts/before_think": 744.0 + }, + { + "avg_penalty/after_target": 2.010825991630554, + "avg_penalty/after_think": 2.99669873714447, + "avg_penalty/before_target": 0.4162744879722595, + "avg_penalty/before_think": 0.5266000665724277, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.25, + "completions/max_terminated_length": 630.25, + "completions/mean_length": 237.296875, + "completions/mean_terminated_length": 237.296875, + "completions/min_length": 78.25, + "completions/min_terminated_length": 78.25, + "epoch": 0.6385, + "grad_norm": 4.687079906463623, + "kl": 22.71875, + "learning_rate": 6.976301092495556e-06, + "loss": 1.8601, + "num_tokens": 38722346.0, + "reward": 1.6015625, + "reward_std": 0.8182963132858276, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.42516325414180756, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3880089297890663, + "step": 1277, + "token_counts/after_target": 799.25, + "token_counts/after_think": 60.75, + "token_counts/before_target": 2095.0, + "token_counts/before_think": 841.75 + }, + { + "avg_penalty/after_target": 2.19280081987381, + "avg_penalty/after_think": 1.8815212845802307, + "avg_penalty/before_target": 0.49593379348516464, + "avg_penalty/before_think": 0.410852387547493, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 669.5, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 209.8125, + "completions/mean_terminated_length": 196.49583435058594, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.639, + "grad_norm": 6.724034786224365, + "kl": 20.890625, + "learning_rate": 6.959669390745097e-06, + "loss": 1.6062, + "num_tokens": 38744686.0, + "reward": 1.53125, + "reward_std": 0.8988316208124161, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.11967839300632477, + 
"rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4402689263224602, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.37397409602999687, + "step": 1278, + "token_counts/after_target": 762.25, + "token_counts/after_think": 18.25, + "token_counts/before_target": 1731.0, + "token_counts/before_think": 845.5 + }, + { + "avg_penalty/after_target": 2.4197872281074524, + "avg_penalty/after_think": 3.569480001926422, + "avg_penalty/before_target": 0.4296122118830681, + "avg_penalty/before_think": 0.749649703502655, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.75, + "completions/max_terminated_length": 618.75, + "completions/mean_length": 287.015625, + "completions/mean_terminated_length": 287.015625, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.6395, + "grad_norm": 5.764610767364502, + "kl": 12.3359375, + "learning_rate": 6.943046950368944e-06, + "loss": 1.1704, + "num_tokens": 38771631.0, + "reward": 1.5625, + "reward_std": 0.7533151507377625, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.41898179799318314, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.35146377980709076, + "step": 1279, + "token_counts/after_target": 1231.75, + "token_counts/after_think": 43.0, + "token_counts/before_target": 1738.25, + "token_counts/before_think": 1579.25 + }, + { + "avg_penalty/after_target": 2.33339262008667, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.38509947806596756, + "avg_penalty/before_think": 0.5380846261978149, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.25, + "completions/max_terminated_length": 698.25, + "completions/mean_length": 276.640625, + "completions/mean_terminated_length": 276.640625, + "completions/min_length": 70.25, + "completions/min_terminated_length": 70.25, + "epoch": 0.64, + "grad_norm": 9.7892484664917, + "kl": 23.171875, + "learning_rate": 6.92643382200193e-06, + "loss": 1.7482, + "num_tokens": 38799832.0, + "reward": 1.47265625, + "reward_std": 0.8254047781229019, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4550696536898613, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.39211585372686386, + "step": 1280, + "token_counts/after_target": 694.0, + "token_counts/after_think": 36.5, + "token_counts/before_target": 2580.25, + "token_counts/before_think": 1115.5 + }, + { + "avg_penalty/after_target": 1.7392074167728424, + "avg_penalty/after_think": 3.776355803012848, + "avg_penalty/before_target": 0.3965062201023102, + "avg_penalty/before_think": 0.603188157081604, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.25, + "completions/max_terminated_length": 597.25, + "completions/mean_length": 215.671875, + "completions/mean_terminated_length": 215.671875, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.6405, + "grad_norm": 4.158177852630615, + "kl": 13.8125, + "learning_rate": 6.909830056250527e-06, + "loss": 1.3272, + "num_tokens": 38824275.0, + "reward": 1.6484375, + "reward_std": 0.7026238441467285, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 
0.4066260978579521, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.3236655667424202, + "step": 1281, + "token_counts/after_target": 569.5, + "token_counts/after_think": 29.0, + "token_counts/before_target": 2058.75, + "token_counts/before_think": 793.5 + }, + { + "avg_penalty/after_target": 2.5993216037750244, + "avg_penalty/after_think": 3.964602768421173, + "avg_penalty/before_target": 0.5050091631710529, + "avg_penalty/before_think": 0.43408606201410294, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 218.609375, + "completions/mean_terminated_length": 218.609375, + "completions/min_length": 43.75, + "completions/min_terminated_length": 43.75, + "epoch": 0.641, + "grad_norm": 3.216594934463501, + "kl": 17.640625, + "learning_rate": 6.893235703692685e-06, + "loss": 1.5746, + "num_tokens": 38846618.0, + "reward": 1.53515625, + "reward_std": 0.8721463829278946, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4493217319250107, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.3908481299877167, + "step": 1282, + "token_counts/after_target": 681.0, + "token_counts/after_think": 73.25, + "token_counts/before_target": 1554.25, + "token_counts/before_think": 1189.25 + }, + { + "avg_penalty/after_target": 3.3262664079666138, + "avg_penalty/after_think": 3.7596023082733154, + "avg_penalty/before_target": 0.269077830016613, + "avg_penalty/before_think": 0.593117892742157, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 613.25, + "completions/max_terminated_length": 613.25, + "completions/mean_length": 213.4375, + "completions/mean_terminated_length": 213.4375, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.6415, + "grad_norm": 5.8137736320495605, + "kl": 13.296875, + "learning_rate": 6.876650814877675e-06, + "loss": 1.3541, + "num_tokens": 38869750.0, + "reward": 1.62109375, + "reward_std": 0.7540781199932098, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.3632233291864395, + "step": 1283, + "token_counts/after_target": 468.75, + "token_counts/after_think": 146.0, + "token_counts/before_target": 1696.5, + "token_counts/before_think": 1103.75 + }, + { + "avg_penalty/after_target": 2.4364626705646515, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.43991411104798317, + "avg_penalty/before_think": 0.49029773473739624, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.5, + "completions/max_terminated_length": 593.5, + "completions/mean_length": 210.859375, + "completions/mean_terminated_length": 210.859375, + "completions/min_length": 38.75, + "completions/min_terminated_length": 38.75, + "epoch": 0.642, + "grad_norm": 6.2955145835876465, + "kl": 14.984375, + "learning_rate": 6.860075440325951e-06, + "loss": 1.5135, + "num_tokens": 38894493.0, + "reward": 1.63671875, + "reward_std": 0.7626260668039322, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3943893313407898, + 
"rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.3709261491894722, + "step": 1284, + "token_counts/after_target": 739.5, + "token_counts/after_think": 21.0, + "token_counts/before_target": 1656.75, + "token_counts/before_think": 956.5 + }, + { + "avg_penalty/after_target": 2.4034699201583862, + "avg_penalty/after_think": 3.996563196182251, + "avg_penalty/before_target": 0.2991257645189762, + "avg_penalty/before_think": 0.5173903852701187, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 210.390625, + "completions/mean_terminated_length": 210.390625, + "completions/min_length": 52.5, + "completions/min_terminated_length": 52.5, + "epoch": 0.6425, + "grad_norm": 3.5107803344726562, + "kl": 10.9375, + "learning_rate": 6.843509630528977e-06, + "loss": 1.0614, + "num_tokens": 38916438.0, + "reward": 1.66796875, + "reward_std": 0.7022461593151093, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3943893313407898, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.33402015268802643, + "step": 1285, + "token_counts/after_target": 401.0, + "token_counts/after_think": 147.0, + "token_counts/before_target": 1834.0, + "token_counts/before_think": 984.25 + }, + { + "avg_penalty/after_target": 1.7958011627197266, + "avg_penalty/after_think": 3.7119104862213135, + "avg_penalty/before_target": 0.5086958482861519, + "avg_penalty/before_think": 0.6599966064095497, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + 
"completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 240.640625, + "completions/mean_terminated_length": 240.640625, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.643, + "grad_norm": 2.63993501663208, + "kl": 16.953125, + "learning_rate": 6.826953435949081e-06, + "loss": 1.4561, + "num_tokens": 38943487.0, + "reward": 1.49609375, + "reward_std": 0.8436584323644638, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.46296359598636627, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.40008771419525146, + "step": 1286, + "token_counts/after_target": 748.5, + "token_counts/after_think": 194.5, + "token_counts/before_target": 2181.25, + "token_counts/before_think": 726.0 + }, + { + "avg_penalty/after_target": 2.6663957238197327, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.41160327941179276, + "avg_penalty/before_think": 0.5859745442867279, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 262.59375, + "completions/mean_terminated_length": 262.59375, + "completions/min_length": 47.25, + "completions/min_terminated_length": 47.25, + "epoch": 0.6435, + "grad_norm": 3.0927343368530273, + "kl": 19.03125, + "learning_rate": 6.8104069070193e-06, + "loss": 1.6852, + "num_tokens": 38970373.0, + "reward": 1.45703125, + "reward_std": 0.8213835954666138, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4581565484404564, + "rewards/tag_count_reward/mean": 0.76953125, + 
"rewards/tag_count_reward/std": 0.38780589401721954, + "step": 1287, + "token_counts/after_target": 1000.75, + "token_counts/after_think": 119.0, + "token_counts/before_target": 2081.5, + "token_counts/before_think": 1000.25 + }, + { + "avg_penalty/after_target": 2.248164802789688, + "avg_penalty/after_think": 3.8512834310531616, + "avg_penalty/before_target": 0.29494886100292206, + "avg_penalty/before_think": 0.42348548769950867, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.25, + "completions/max_terminated_length": 467.25, + "completions/mean_length": 187.546875, + "completions/mean_terminated_length": 187.546875, + "completions/min_length": 36.75, + "completions/min_terminated_length": 36.75, + "epoch": 0.644, + "grad_norm": 6.362381458282471, + "kl": 9.3671875, + "learning_rate": 6.793870094143238e-06, + "loss": 1.0483, + "num_tokens": 38992840.0, + "reward": 1.78515625, + "reward_std": 0.47373178601264954, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.24467839300632477, + "rewards/tag_count_reward/mean": 0.89453125, + "rewards/tag_count_reward/std": 0.22905339300632477, + "step": 1288, + "token_counts/after_target": 247.5, + "token_counts/after_think": 149.5, + "token_counts/before_target": 1535.0, + "token_counts/before_think": 1068.75 + }, + { + "avg_penalty/after_target": 3.227174162864685, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.29590653255581856, + "avg_penalty/before_think": 0.411227785050869, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.25, + 
"completions/max_terminated_length": 487.25, + "completions/mean_length": 174.4375, + "completions/mean_terminated_length": 174.4375, + "completions/min_length": 35.25, + "completions/min_terminated_length": 35.25, + "epoch": 0.6445, + "grad_norm": 4.6691484451293945, + "kl": 21.4375, + "learning_rate": 6.777343047694891e-06, + "loss": 1.7146, + "num_tokens": 39015924.0, + "reward": 1.49609375, + "reward_std": 0.8555684685707092, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44938503205776215, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.41318075358867645, + "step": 1289, + "token_counts/after_target": 586.75, + "token_counts/after_think": 20.25, + "token_counts/before_target": 1554.25, + "token_counts/before_think": 629.75 + }, + { + "avg_penalty/after_target": 2.525093227624893, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.47568219900131226, + "avg_penalty/before_think": 0.4685956761240959, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.75, + "completions/max_terminated_length": 501.75, + "completions/mean_length": 180.515625, + "completions/mean_terminated_length": 180.515625, + "completions/min_length": 38.25, + "completions/min_terminated_length": 38.25, + "epoch": 0.645, + "grad_norm": 3.446885824203491, + "kl": 20.828125, + "learning_rate": 6.7608258180185085e-06, + "loss": 1.7353, + "num_tokens": 39035941.0, + "reward": 1.5859375, + "reward_std": 0.754236489534378, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4075859263539314, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 
0.36664126068353653, + "step": 1290, + "token_counts/after_target": 646.25, + "token_counts/after_think": 13.0, + "token_counts/before_target": 1488.5, + "token_counts/before_think": 740.5 + }, + { + "avg_penalty/after_target": 2.126791685819626, + "avg_penalty/after_think": 2.9554269909858704, + "avg_penalty/before_target": 0.361503504216671, + "avg_penalty/before_think": 0.5029775351285934, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 676.25, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 173.59375, + "completions/mean_terminated_length": 159.72916793823242, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.6455, + "grad_norm": 4.649471282958984, + "kl": 21.53125, + "learning_rate": 6.744318455428436e-06, + "loss": 1.9365, + "num_tokens": 39055051.0, + "reward": 1.62109375, + "reward_std": 0.7168962955474854, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.38724804669618607, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.3370058350265026, + "step": 1291, + "token_counts/after_target": 375.5, + "token_counts/after_think": 32.0, + "token_counts/before_target": 1702.5, + "token_counts/before_think": 667.5 + }, + { + "avg_penalty/after_target": 2.8821628093719482, + "avg_penalty/after_think": 2.917833387851715, + "avg_penalty/before_target": 0.32572444155812263, + "avg_penalty/before_think": 0.48655659705400467, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.5, + "completions/max_terminated_length": 542.5, + 
"completions/mean_length": 214.765625, + "completions/mean_terminated_length": 214.765625, + "completions/min_length": 56.75, + "completions/min_terminated_length": 56.75, + "epoch": 0.646, + "grad_norm": 3.5288453102111816, + "kl": 13.55859375, + "learning_rate": 6.727821010208961e-06, + "loss": 1.274, + "num_tokens": 39081100.0, + "reward": 1.8125, + "reward_std": 0.651333287358284, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.1280868947505951, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.37366948276758194, + "rewards/tag_count_reward/mean": 0.890625, + "rewards/tag_count_reward/std": 0.23578011244535446, + "step": 1292, + "token_counts/after_target": 576.75, + "token_counts/after_think": 22.0, + "token_counts/before_target": 1727.0, + "token_counts/before_think": 1110.5 + }, + { + "avg_penalty/after_target": 3.1106388568878174, + "avg_penalty/after_think": 1.4482063055038452, + "avg_penalty/before_target": 0.35895323008298874, + "avg_penalty/before_think": 0.6038519665598869, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.5, + "completions/max_terminated_length": 571.5, + "completions/mean_length": 211.4375, + "completions/mean_terminated_length": 211.4375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.6465, + "grad_norm": 3.8656113147735596, + "kl": 21.5, + "learning_rate": 6.711333532614168e-06, + "loss": 1.9327, + "num_tokens": 39106408.0, + "reward": 1.5703125, + "reward_std": 0.7636993229389191, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42867646366357803, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.34767450392246246, + "step": 1293, + 
"token_counts/after_target": 889.75, + "token_counts/after_think": 32.0, + "token_counts/before_target": 1619.75, + "token_counts/before_think": 841.5 + }, + { + "avg_penalty/after_target": 2.6462549567222595, + "avg_penalty/after_think": 1.7410206198692322, + "avg_penalty/before_target": 0.3556036874651909, + "avg_penalty/before_think": 0.4648102968931198, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 635.75, + "completions/max_terminated_length": 525.5, + "completions/mean_length": 179.75, + "completions/mean_terminated_length": 166.21979522705078, + "completions/min_length": 35.25, + "completions/min_terminated_length": 35.25, + "epoch": 0.647, + "grad_norm": 5.632431507110596, + "kl": 28.1171875, + "learning_rate": 6.694856072867772e-06, + "loss": 2.4379, + "num_tokens": 39127544.0, + "reward": 1.609375, + "reward_std": 0.70290157943964, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4057852029800415, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.31877924501895905, + "step": 1294, + "token_counts/after_target": 838.25, + "token_counts/after_think": 6.25, + "token_counts/before_target": 1305.25, + "token_counts/before_think": 726.25 + }, + { + "avg_penalty/after_target": 2.325399696826935, + "avg_penalty/after_think": 2.392383188009262, + "avg_penalty/before_target": 0.485576830804348, + "avg_penalty/before_think": 0.574222519993782, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 786.5, + "completions/max_terminated_length": 718.5, + "completions/mean_length": 279.609375, + 
"completions/mean_terminated_length": 268.60938262939453, + "completions/min_length": 67.25, + "completions/min_terminated_length": 67.25, + "epoch": 0.6475, + "grad_norm": 12.976705551147461, + "kl": 27.65625, + "learning_rate": 6.67838868116297e-06, + "loss": 1.9679, + "num_tokens": 39157775.0, + "reward": 1.296875, + "reward_std": 0.8355720639228821, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.48558124154806137, + "rewards/tag_count_reward/mean": 0.703125, + "rewards/tag_count_reward/std": 0.3953152000904083, + "step": 1295, + "token_counts/after_target": 991.5, + "token_counts/after_think": 89.0, + "token_counts/before_target": 2256.5, + "token_counts/before_think": 1136.75 + }, + { + "avg_penalty/after_target": 2.035285919904709, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.39537105709314346, + "avg_penalty/before_think": 0.48157264292240143, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.75, + "completions/max_terminated_length": 673.75, + "completions/mean_length": 269.5625, + "completions/mean_terminated_length": 269.5625, + "completions/min_length": 56.5, + "completions/min_terminated_length": 56.5, + "epoch": 0.648, + "grad_norm": 16.147193908691406, + "kl": 31.09375, + "learning_rate": 6.661931407662292e-06, + "loss": 2.1439, + "num_tokens": 39185475.0, + "reward": 1.3203125, + "reward_std": 0.8432074934244156, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.46566852182149887, + "rewards/tag_count_reward/mean": 0.6953125, + "rewards/tag_count_reward/std": 0.3996284157037735, + "step": 1296, + "token_counts/after_target": 846.0, + 
"token_counts/after_think": 56.75, + "token_counts/before_target": 2533.0, + "token_counts/before_think": 877.25 + }, + { + "avg_penalty/after_target": 2.4550780951976776, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3848141133785248, + "avg_penalty/before_think": 0.6045235507190228, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.5, + "completions/max_terminated_length": 671.5, + "completions/mean_length": 270.875, + "completions/mean_terminated_length": 270.875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.6485, + "grad_norm": 4.536838531494141, + "kl": 24.296875, + "learning_rate": 6.645484302497452e-06, + "loss": 2.0502, + "num_tokens": 39212763.0, + "reward": 1.36328125, + "reward_std": 0.8288102895021439, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.47865550220012665, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.3828350305557251, + "step": 1297, + "token_counts/after_target": 974.0, + "token_counts/after_think": 184.0, + "token_counts/before_target": 2137.5, + "token_counts/before_think": 1038.5 + }, + { + "avg_penalty/after_target": 2.8680053055286407, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.38569923490285873, + "avg_penalty/before_think": 0.5622728168964386, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 695.5, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 232.65625, + "completions/mean_terminated_length": 220.70417022705078, + "completions/min_length": 
55.5, + "completions/min_terminated_length": 55.5, + "epoch": 0.649, + "grad_norm": 10.962584495544434, + "kl": 26.140625, + "learning_rate": 6.629047415769181e-06, + "loss": 1.8831, + "num_tokens": 39235573.0, + "reward": 1.2734375, + "reward_std": 0.8575478196144104, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.4692344516515732, + "rewards/tag_count_reward/mean": 0.6953125, + "rewards/tag_count_reward/std": 0.4248970150947571, + "step": 1298, + "token_counts/after_target": 915.5, + "token_counts/after_think": 6.75, + "token_counts/before_target": 1852.0, + "token_counts/before_think": 948.25 + }, + { + "avg_penalty/after_target": 2.702854812145233, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4677448198199272, + "avg_penalty/before_think": 0.6799461394548416, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 730.75, + "completions/max_terminated_length": 719.75, + "completions/mean_length": 298.453125, + "completions/mean_terminated_length": 288.52396392822266, + "completions/min_length": 62.5, + "completions/min_terminated_length": 62.5, + "epoch": 0.6495, + "grad_norm": 3.546708583831787, + "kl": 24.375, + "learning_rate": 6.612620797547087e-06, + "loss": 2.0428, + "num_tokens": 39265794.0, + "reward": 1.4765625, + "reward_std": 0.7832255512475967, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.46513500809669495, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.36300037056207657, + "step": 1299, + "token_counts/after_target": 1200.25, + "token_counts/after_think": 131.25, + "token_counts/before_target": 2302.25, + 
"token_counts/before_think": 1141.5 + }, + { + "avg_penalty/after_target": 2.074944257736206, + "avg_penalty/after_think": 3.890765428543091, + "avg_penalty/before_target": 0.4050059840083122, + "avg_penalty/before_think": 0.5960963293910027, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.75, + "completions/max_terminated_length": 468.75, + "completions/mean_length": 223.171875, + "completions/mean_terminated_length": 223.171875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.65, + "grad_norm": 8.817621231079102, + "kl": 20.125, + "learning_rate": 6.596204497869501e-06, + "loss": 1.515, + "num_tokens": 39288749.0, + "reward": 1.51171875, + "reward_std": 0.8028560429811478, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45247192680835724, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.37177254259586334, + "step": 1300, + "token_counts/after_target": 618.75, + "token_counts/after_think": 110.75, + "token_counts/before_target": 1943.0, + "token_counts/before_think": 898.25 + }, + { + "avg_penalty/after_target": 1.8910525441169739, + "avg_penalty/after_think": 1.9387853145599365, + "avg_penalty/before_target": 0.42760762572288513, + "avg_penalty/before_think": 0.3686774671077728, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.75, + "completions/max_terminated_length": 629.75, + "completions/mean_length": 201.0, + "completions/mean_terminated_length": 201.0, + "completions/min_length": 52.5, + "completions/min_terminated_length": 52.5, + "epoch": 
0.6505, + "grad_norm": 5.892998695373535, + "kl": 26.28125, + "learning_rate": 6.579798566743314e-06, + "loss": 2.0641, + "num_tokens": 39313037.0, + "reward": 1.421875, + "reward_std": 0.9078418463468552, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.479247085750103, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.4264317825436592, + "step": 1301, + "token_counts/after_target": 513.0, + "token_counts/after_think": 48.0, + "token_counts/before_target": 1965.0, + "token_counts/before_think": 690.0 + }, + { + "avg_penalty/after_target": 2.8877081274986267, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.46493910998106003, + "avg_penalty/before_think": 0.48755790293216705, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.5, + "completions/max_terminated_length": 653.5, + "completions/mean_length": 248.484375, + "completions/mean_terminated_length": 248.484375, + "completions/min_length": 79.5, + "completions/min_terminated_length": 79.5, + "epoch": 0.651, + "grad_norm": 4.200225353240967, + "kl": 24.46875, + "learning_rate": 6.56340305414384e-06, + "loss": 2.1199, + "num_tokens": 39336428.0, + "reward": 1.36328125, + "reward_std": 0.8012139201164246, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.42516325414180756, + "rewards/tag_count_reward/mean": 0.70703125, + "rewards/tag_count_reward/std": 0.3928399011492729, + "step": 1302, + "token_counts/after_target": 1048.75, + "token_counts/after_think": 79.5, + "token_counts/before_target": 2035.5, + "token_counts/before_think": 812.0 + }, + { + "avg_penalty/after_target": 
2.3248754739761353, + "avg_penalty/after_think": 1.7937341928482056, + "avg_penalty/before_target": 0.38554147630929947, + "avg_penalty/before_think": 0.6024882197380066, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.5, + "completions/max_terminated_length": 549.5, + "completions/mean_length": 249.515625, + "completions/mean_terminated_length": 249.515625, + "completions/min_length": 40.5, + "completions/min_terminated_length": 40.5, + "epoch": 0.6515, + "grad_norm": 4.274703025817871, + "kl": 23.09375, + "learning_rate": 6.547018010014654e-06, + "loss": 1.8903, + "num_tokens": 39366205.0, + "reward": 1.33203125, + "reward_std": 0.8532237708568573, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.46875541657209396, + "rewards/tag_count_reward/mean": 0.69140625, + "rewards/tag_count_reward/std": 0.42273835837841034, + "step": 1303, + "token_counts/after_target": 959.5, + "token_counts/after_think": 57.5, + "token_counts/before_target": 2126.5, + "token_counts/before_think": 848.75 + }, + { + "avg_penalty/after_target": 2.2835479378700256, + "avg_penalty/after_think": 2.426238387823105, + "avg_penalty/before_target": 0.4459420442581177, + "avg_penalty/before_think": 0.7114565819501877, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.25, + "completions/max_terminated_length": 673.25, + "completions/mean_length": 315.65625, + "completions/mean_terminated_length": 315.65625, + "completions/min_length": 62.5, + "completions/min_terminated_length": 62.5, + "epoch": 0.652, + "grad_norm": 5.401803493499756, + "kl": 24.21875, + 
"learning_rate": 6.530643484267443e-06, + "loss": 1.9482, + "num_tokens": 39400375.0, + "reward": 1.16796875, + "reward_std": 0.8958030641078949, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.53125, + "rewards/format_reward/std": 0.5059641748666763, + "rewards/tag_count_reward/mean": 0.63671875, + "rewards/tag_count_reward/std": 0.42871635407209396, + "step": 1304, + "token_counts/after_target": 1258.5, + "token_counts/after_think": 38.0, + "token_counts/before_target": 2741.0, + "token_counts/before_think": 1013.0 + }, + { + "avg_penalty/after_target": 2.784541815519333, + "avg_penalty/after_think": 3.802373766899109, + "avg_penalty/before_target": 0.5740882828831673, + "avg_penalty/before_think": 0.3941038176417351, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.5, + "completions/max_terminated_length": 628.5, + "completions/mean_length": 237.734375, + "completions/mean_terminated_length": 237.734375, + "completions/min_length": 58.25, + "completions/min_terminated_length": 58.25, + "epoch": 0.6525, + "grad_norm": 12.59223747253418, + "kl": 15.609375, + "learning_rate": 6.5142795267818505e-06, + "loss": 1.7712, + "num_tokens": 39423558.0, + "reward": 1.546875, + "reward_std": 0.7328063249588013, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.42206869274377823, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.3365965783596039, + "step": 1305, + "token_counts/after_target": 935.0, + "token_counts/after_think": 60.25, + "token_counts/before_target": 1834.75, + "token_counts/before_think": 973.75 + }, + { + "avg_penalty/after_target": 3.1474721431732178, + "avg_penalty/after_think": 3.0, + 
"avg_penalty/before_target": 0.26730333268642426, + "avg_penalty/before_think": 0.5744863003492355, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.75, + "completions/max_terminated_length": 469.75, + "completions/mean_length": 233.5625, + "completions/mean_terminated_length": 233.5625, + "completions/min_length": 61.25, + "completions/min_terminated_length": 61.25, + "epoch": 0.653, + "grad_norm": 7.72523307800293, + "kl": 12.671875, + "learning_rate": 6.497926187405326e-06, + "loss": 1.3324, + "num_tokens": 39448170.0, + "reward": 1.58203125, + "reward_std": 0.8589016795158386, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.11180340498685837, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4304215610027313, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.39627768099308014, + "step": 1306, + "token_counts/after_target": 729.75, + "token_counts/after_think": 130.75, + "token_counts/before_target": 2004.0, + "token_counts/before_think": 872.5 + }, + { + "avg_penalty/after_target": 2.157068759202957, + "avg_penalty/after_think": 2.8890634775161743, + "avg_penalty/before_target": 0.466670885682106, + "avg_penalty/before_think": 0.6499391347169876, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 702.75, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 292.265625, + "completions/mean_terminated_length": 281.83333587646484, + "completions/min_length": 92.5, + "completions/min_terminated_length": 92.5, + "epoch": 0.6535, + "grad_norm": 12.957975387573242, + "kl": 11.1015625, + "learning_rate": 6.481583515952983e-06, + 
"loss": 1.3615, + "num_tokens": 39477579.0, + "reward": 1.68359375, + "reward_std": 0.6028014719486237, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.37937305867671967, + "rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.2501321528106928, + "step": 1307, + "token_counts/after_target": 837.0, + "token_counts/after_think": 191.75, + "token_counts/before_target": 2232.5, + "token_counts/before_think": 1415.0 + }, + { + "avg_penalty/after_target": 2.1564157009124756, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.4780646786093712, + "avg_penalty/before_think": 0.3851363770663738, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.0, + "completions/max_terminated_length": 692.0, + "completions/mean_length": 273.140625, + "completions/mean_terminated_length": 273.140625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.654, + "grad_norm": 6.042799472808838, + "kl": 17.390625, + "learning_rate": 6.465251562207431e-06, + "loss": 1.6472, + "num_tokens": 39508452.0, + "reward": 1.40234375, + "reward_std": 0.8241576850414276, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.44495995342731476, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.3882995620369911, + "step": 1308, + "token_counts/after_target": 1059.0, + "token_counts/after_think": 87.25, + "token_counts/before_target": 2333.25, + "token_counts/before_think": 890.75 + }, + { + "avg_penalty/after_target": 2.3594830930233, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4596225246787071, + 
"avg_penalty/before_think": 0.474611259996891, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 739.0, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 339.84375, + "completions/mean_terminated_length": 329.29583740234375, + "completions/min_length": 86.75, + "completions/min_terminated_length": 86.75, + "epoch": 0.6545, + "grad_norm": 4.616485595703125, + "kl": 17.125, + "learning_rate": 6.448930375918632e-06, + "loss": 1.6095, + "num_tokens": 39539226.0, + "reward": 1.40625, + "reward_std": 0.8384378999471664, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.466681070625782, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.39689620584249496, + "step": 1309, + "token_counts/after_target": 1281.25, + "token_counts/after_think": 193.0, + "token_counts/before_target": 2685.25, + "token_counts/before_think": 1278.0 + }, + { + "avg_penalty/after_target": 2.28188419342041, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.41924017667770386, + "avg_penalty/before_think": 0.4043452963232994, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.25, + "completions/max_terminated_length": 592.25, + "completions/mean_length": 245.171875, + "completions/mean_terminated_length": 245.171875, + "completions/min_length": 43.25, + "completions/min_terminated_length": 43.25, + "epoch": 0.655, + "grad_norm": 3.34761118888855, + "kl": 19.375, + "learning_rate": 6.432620006803747e-06, + "loss": 1.6302, + "num_tokens": 39569509.0, + "reward": 1.44921875, + "reward_std": 
0.9192274659872055, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4550696536898613, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.432015024125576, + "step": 1310, + "token_counts/after_target": 729.75, + "token_counts/after_think": 36.75, + "token_counts/before_target": 2489.25, + "token_counts/before_think": 667.0 + }, + { + "avg_penalty/after_target": 2.029172360897064, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.48860786855220795, + "avg_penalty/before_think": 0.6430510953068733, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 277.4375, + "completions/mean_terminated_length": 277.4375, + "completions/min_length": 56.75, + "completions/min_terminated_length": 56.75, + "epoch": 0.6555, + "grad_norm": 5.716731071472168, + "kl": 19.375, + "learning_rate": 6.4163205045469975e-06, + "loss": 1.8567, + "num_tokens": 39597921.0, + "reward": 1.51953125, + "reward_std": 0.8102801442146301, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4519384130835533, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.38916847109794617, + "step": 1311, + "token_counts/after_target": 997.5, + "token_counts/after_think": 139.75, + "token_counts/before_target": 2264.75, + "token_counts/before_think": 1037.0 + }, + { + "avg_penalty/after_target": 2.3803292512893677, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.4916902855038643, + "avg_penalty/before_think": 0.4151558503508568, + "clip_ratio/high_max": 0.0, 
+ "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.25, + "completions/max_terminated_length": 681.25, + "completions/mean_length": 283.203125, + "completions/mean_terminated_length": 283.203125, + "completions/min_length": 50.25, + "completions/min_terminated_length": 50.25, + "epoch": 0.656, + "grad_norm": 3.6967875957489014, + "kl": 25.3125, + "learning_rate": 6.4000319187994895e-06, + "loss": 2.2133, + "num_tokens": 39625326.0, + "reward": 1.29296875, + "reward_std": 0.9211780577898026, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.48456869274377823, + "rewards/tag_count_reward/mean": 0.66796875, + "rewards/tag_count_reward/std": 0.4551232233643532, + "step": 1312, + "token_counts/after_target": 1228.25, + "token_counts/after_think": 37.75, + "token_counts/before_target": 2508.5, + "token_counts/before_think": 756.75 + }, + { + "avg_penalty/after_target": 3.2341777086257935, + "avg_penalty/after_think": 1.8861274123191833, + "avg_penalty/before_target": 0.42674100771546364, + "avg_penalty/before_think": 0.7287473380565643, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.5, + "completions/max_terminated_length": 616.5, + "completions/mean_length": 208.734375, + "completions/mean_terminated_length": 208.734375, + "completions/min_length": 49.25, + "completions/min_terminated_length": 49.25, + "epoch": 0.6565, + "grad_norm": 7.384073257446289, + "kl": 20.296875, + "learning_rate": 6.383754299179079e-06, + "loss": 1.9498, + "num_tokens": 39649805.0, + "reward": 1.40234375, + "reward_std": 0.8498368263244629, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.49244368076324463, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.3904201164841652, + "step": 1313, + "token_counts/after_target": 789.0, + "token_counts/after_think": 31.25, + "token_counts/before_target": 1946.25, + "token_counts/before_think": 573.25 + }, + { + "avg_penalty/after_target": 2.8063268065452576, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3816417306661606, + "avg_penalty/before_think": 0.5965049564838409, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.75, + "completions/max_terminated_length": 662.75, + "completions/mean_length": 302.8125, + "completions/mean_terminated_length": 302.8125, + "completions/min_length": 78.75, + "completions/min_terminated_length": 78.75, + "epoch": 0.657, + "grad_norm": 3.4234511852264404, + "kl": 17.921875, + "learning_rate": 6.367487695270218e-06, + "loss": 1.6911, + "num_tokens": 39678481.0, + "reward": 1.5546875, + "reward_std": 0.843305230140686, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4229728877544403, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.40779732167720795, + "step": 1314, + "token_counts/after_target": 928.75, + "token_counts/after_think": 108.0, + "token_counts/before_target": 2799.25, + "token_counts/before_think": 1009.0 + }, + { + "avg_penalty/after_target": 2.5898856818675995, + "avg_penalty/after_think": 2.893664538860321, + "avg_penalty/before_target": 0.31361062079668045, + "avg_penalty/before_think": 0.4774973914027214, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.5, + "completions/max_terminated_length": 466.5, + "completions/mean_length": 185.0625, + "completions/mean_terminated_length": 185.0625, + "completions/min_length": 44.5, + "completions/min_terminated_length": 44.5, + "epoch": 0.6575, + "grad_norm": 2.9750382900238037, + "kl": 19.40625, + "learning_rate": 6.351232156623803e-06, + "loss": 1.7131, + "num_tokens": 39701461.0, + "reward": 1.6015625, + "reward_std": 0.7794404774904251, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3795977011322975, + "step": 1315, + "token_counts/after_target": 446.75, + "token_counts/after_think": 22.0, + "token_counts/before_target": 1713.25, + "token_counts/before_think": 779.0 + }, + { + "avg_penalty/after_target": 3.033167153596878, + "avg_penalty/after_think": 3.559792160987854, + "avg_penalty/before_target": 0.2598409466445446, + "avg_penalty/before_think": 0.40652861446142197, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 156.015625, + "completions/mean_terminated_length": 156.015625, + "completions/min_length": 46.25, + "completions/min_terminated_length": 46.25, + "epoch": 0.658, + "grad_norm": 6.203392505645752, + "kl": 16.796875, + "learning_rate": 6.334987732757028e-06, + "loss": 1.3175, + "num_tokens": 39720566.0, + "reward": 1.6015625, + "reward_std": 0.6823674365878105, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.38879410922527313, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.32419075071811676, + "step": 1316, + "token_counts/after_target": 294.0, + "token_counts/after_think": 34.75, + "token_counts/before_target": 1486.0, + "token_counts/before_think": 681.5 + }, + { + "avg_penalty/after_target": 2.4831671118736267, + "avg_penalty/after_think": 2.9386860132217407, + "avg_penalty/before_target": 0.3724362924695015, + "avg_penalty/before_think": 0.6264014840126038, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.75, + "completions/max_terminated_length": 547.75, + "completions/mean_length": 235.171875, + "completions/mean_terminated_length": 235.171875, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.6585, + "grad_norm": 3.575869083404541, + "kl": 19.40625, + "learning_rate": 6.318754473153221e-06, + "loss": 1.6374, + "num_tokens": 39746481.0, + "reward": 1.50390625, + "reward_std": 0.8561348915100098, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4519384130835533, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.4124656245112419, + "step": 1317, + "token_counts/after_target": 730.75, + "token_counts/after_think": 121.75, + "token_counts/before_target": 2203.25, + "token_counts/before_think": 707.0 + }, + { + "avg_penalty/after_target": 3.1688477993011475, + "avg_penalty/after_think": 3.708728551864624, + "avg_penalty/before_target": 0.2929413504898548, + "avg_penalty/before_think": 0.45817068964242935, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 
0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 163.4375, + "completions/mean_terminated_length": 163.4375, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.659, + "grad_norm": 6.783631801605225, + "kl": 20.2578125, + "learning_rate": 6.302532427261708e-06, + "loss": 1.6474, + "num_tokens": 39767917.0, + "reward": 1.55078125, + "reward_std": 0.7687508910894394, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.41898179799318314, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.36275051534175873, + "step": 1318, + "token_counts/after_target": 284.75, + "token_counts/after_think": 93.25, + "token_counts/before_target": 1497.5, + "token_counts/before_think": 739.5 + }, + { + "avg_penalty/after_target": 2.9469736218452454, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.44473996758461, + "avg_penalty/before_think": 0.524602122604847, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 723.5, + "completions/max_terminated_length": 636.75, + "completions/mean_length": 203.265625, + "completions/mean_terminated_length": 189.93333435058594, + "completions/min_length": 48.75, + "completions/min_terminated_length": 48.75, + "epoch": 0.6595, + "grad_norm": 3.990999698638916, + "kl": 23.81640625, + "learning_rate": 6.286321644497655e-06, + "loss": 2.1457, + "num_tokens": 39792366.0, + "reward": 1.55859375, + "reward_std": 0.7469326853752136, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + 
"rewards/format_reward/std": 0.41110680997371674, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.34068290144205093, + "step": 1319, + "token_counts/after_target": 798.0, + "token_counts/after_think": 87.0, + "token_counts/before_target": 1647.25, + "token_counts/before_think": 720.0 + }, + { + "avg_penalty/after_target": 2.6773089468479156, + "avg_penalty/after_think": 3.8164060711860657, + "avg_penalty/before_target": 0.32879965752363205, + "avg_penalty/before_think": 0.38032611459493637, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.75, + "completions/max_terminated_length": 602.75, + "completions/mean_length": 182.40625, + "completions/mean_terminated_length": 182.40625, + "completions/min_length": 33.25, + "completions/min_terminated_length": 33.25, + "epoch": 0.66, + "grad_norm": 8.274839401245117, + "kl": 29.375, + "learning_rate": 6.2701221742419106e-06, + "loss": 2.3261, + "num_tokens": 39818408.0, + "reward": 1.49609375, + "reward_std": 0.8387809991836548, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.39617250114679337, + "step": 1320, + "token_counts/after_target": 525.75, + "token_counts/after_think": 79.25, + "token_counts/before_target": 1754.75, + "token_counts/before_think": 558.75 + }, + { + "avg_penalty/after_target": 2.626249134540558, + "avg_penalty/after_think": 2.6193822026252747, + "avg_penalty/before_target": 0.4030301719903946, + "avg_penalty/before_think": 0.5375386998057365, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 685.75, + "completions/max_terminated_length": 685.75, + "completions/mean_length": 228.53125, + "completions/mean_terminated_length": 228.53125, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.6605, + "grad_norm": 4.24309778213501, + "kl": 17.166015625, + "learning_rate": 6.25393406584088e-06, + "loss": 1.6329, + "num_tokens": 39845978.0, + "reward": 1.59375, + "reward_std": 0.6215836107730865, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.3331565484404564, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.30197902768850327, + "step": 1321, + "token_counts/after_target": 839.25, + "token_counts/after_think": 52.25, + "token_counts/before_target": 2023.75, + "token_counts/before_think": 741.25 + }, + { + "avg_penalty/after_target": 2.7694621086120605, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.4212431460618973, + "avg_penalty/before_think": 0.4733528345823288, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.5, + "completions/max_terminated_length": 673.5, + "completions/mean_length": 227.75, + "completions/mean_terminated_length": 227.75, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.661, + "grad_norm": 6.912680149078369, + "kl": 23.203125, + "learning_rate": 6.237757368606345e-06, + "loss": 1.8039, + "num_tokens": 39870330.0, + "reward": 1.390625, + "reward_std": 0.8626396656036377, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.49244368076324463, + "rewards/tag_count_reward/mean": 0.75, 
+ "rewards/tag_count_reward/std": 0.40641241520643234, + "step": 1322, + "token_counts/after_target": 637.0, + "token_counts/after_think": 121.75, + "token_counts/before_target": 2158.0, + "token_counts/before_think": 727.25 + }, + { + "avg_penalty/after_target": 2.4317972660064697, + "avg_penalty/after_think": 2.459755003452301, + "avg_penalty/before_target": 0.44778259098529816, + "avg_penalty/before_think": 0.3900528997182846, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.25, + "completions/max_terminated_length": 631.25, + "completions/mean_length": 185.734375, + "completions/mean_terminated_length": 185.734375, + "completions/min_length": 35.25, + "completions/min_terminated_length": 35.25, + "epoch": 0.6615, + "grad_norm": 3.8468408584594727, + "kl": 22.90625, + "learning_rate": 6.22159213181533e-06, + "loss": 1.8909, + "num_tokens": 39893241.0, + "reward": 1.52734375, + "reward_std": 0.8324806988239288, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.43616948276758194, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.3918021693825722, + "step": 1323, + "token_counts/after_target": 699.25, + "token_counts/after_think": 33.0, + "token_counts/before_target": 1533.0, + "token_counts/before_think": 706.5 + }, + { + "avg_penalty/after_target": 3.008274883031845, + "avg_penalty/after_think": 2.960001230239868, + "avg_penalty/before_target": 0.3015558235347271, + "avg_penalty/before_think": 0.3661585971713066, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.5, + 
"completions/max_terminated_length": 481.5, + "completions/mean_length": 180.125, + "completions/mean_terminated_length": 180.125, + "completions/min_length": 50.75, + "completions/min_terminated_length": 50.75, + "epoch": 0.662, + "grad_norm": 6.444240093231201, + "kl": 16.6640625, + "learning_rate": 6.205438404709948e-06, + "loss": 1.2296, + "num_tokens": 39916001.0, + "reward": 1.48828125, + "reward_std": 0.8215224295854568, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.44495995342731476, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.39579585939645767, + "step": 1324, + "token_counts/after_target": 350.25, + "token_counts/after_think": 44.75, + "token_counts/before_target": 1655.5, + "token_counts/before_think": 831.5 + }, + { + "avg_penalty/after_target": 2.3232182562351227, + "avg_penalty/after_think": 2.8568630814552307, + "avg_penalty/before_target": 0.3916483372449875, + "avg_penalty/before_think": 0.4813438728451729, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 203.15625, + "completions/mean_terminated_length": 203.15625, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.6625, + "grad_norm": 5.500016689300537, + "kl": 22.3125, + "learning_rate": 6.18929623649726e-06, + "loss": 1.7453, + "num_tokens": 39943067.0, + "reward": 1.4765625, + "reward_std": 0.848240852355957, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4682852029800415, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 
0.4118461161851883, + "step": 1325, + "token_counts/after_target": 577.0, + "token_counts/after_think": 47.5, + "token_counts/before_target": 1923.5, + "token_counts/before_think": 702.5 + }, + { + "avg_penalty/after_target": 2.7086678743362427, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3729509264230728, + "avg_penalty/before_think": 0.6355146318674088, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 626.0, + "completions/max_terminated_length": 518.5, + "completions/mean_length": 212.328125, + "completions/mean_terminated_length": 200.46771240234375, + "completions/min_length": 50.25, + "completions/min_terminated_length": 50.25, + "epoch": 0.663, + "grad_norm": 5.323090553283691, + "kl": 14.0078125, + "learning_rate": 6.173165676349103e-06, + "loss": 1.3894, + "num_tokens": 39965344.0, + "reward": 1.578125, + "reward_std": 0.7046558856964111, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4462348371744156, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.28126004338264465, + "step": 1326, + "token_counts/after_target": 582.75, + "token_counts/after_think": 186.0, + "token_counts/before_target": 1688.25, + "token_counts/before_think": 940.25 + }, + { + "avg_penalty/after_target": 1.8655005991458893, + "avg_penalty/after_think": 3.7592735290527344, + "avg_penalty/before_target": 0.39767996221780777, + "avg_penalty/before_think": 0.5576458647847176, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + 
"completions/mean_length": 228.375, + "completions/mean_terminated_length": 228.375, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.6635, + "grad_norm": 4.222204208374023, + "kl": 17.7734375, + "learning_rate": 6.157046773401964e-06, + "loss": 1.548, + "num_tokens": 39988888.0, + "reward": 1.484375, + "reward_std": 0.7794777452945709, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.46566852182149887, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.3491932302713394, + "step": 1327, + "token_counts/after_target": 824.0, + "token_counts/after_think": 126.5, + "token_counts/before_target": 1940.5, + "token_counts/before_think": 763.0 + }, + { + "avg_penalty/after_target": 2.935054451227188, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.23985140770673752, + "avg_penalty/before_think": 0.5046053603291512, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.5, + "completions/max_terminated_length": 373.5, + "completions/mean_length": 180.015625, + "completions/mean_terminated_length": 180.015625, + "completions/min_length": 58.5, + "completions/min_terminated_length": 58.5, + "epoch": 0.664, + "grad_norm": 5.305407524108887, + "kl": 18.71875, + "learning_rate": 6.140939576756817e-06, + "loss": 1.4217, + "num_tokens": 40008841.0, + "reward": 1.5625, + "reward_std": 0.7834914326667786, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.3559967800974846, + "step": 1328, + "token_counts/after_target": 219.0, + 
"token_counts/after_think": 119.25, + "token_counts/before_target": 1730.25, + "token_counts/before_think": 811.75 + }, + { + "avg_penalty/after_target": 2.540899157524109, + "avg_penalty/after_think": 2.3859915733337402, + "avg_penalty/before_target": 0.29731645435094833, + "avg_penalty/before_think": 0.46399500221014023, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.5, + "completions/max_terminated_length": 427.5, + "completions/mean_length": 174.421875, + "completions/mean_terminated_length": 174.421875, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.6645, + "grad_norm": 5.605502605438232, + "kl": 12.2109375, + "learning_rate": 6.124844135478971e-06, + "loss": 1.097, + "num_tokens": 40029108.0, + "reward": 1.63671875, + "reward_std": 0.7381281554698944, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.34477680176496506, + "step": 1329, + "token_counts/after_target": 400.5, + "token_counts/after_think": 25.75, + "token_counts/before_target": 1340.75, + "token_counts/before_think": 1023.75 + }, + { + "avg_penalty/after_target": 2.297183185815811, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.36120225489139557, + "avg_penalty/before_think": 0.5584772378206253, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.25, + "completions/max_terminated_length": 421.25, + "completions/mean_length": 192.375, + "completions/mean_terminated_length": 192.375, + 
"completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.665, + "grad_norm": 3.5595734119415283, + "kl": 10.4140625, + "learning_rate": 6.108760498597939e-06, + "loss": 1.0523, + "num_tokens": 40053052.0, + "reward": 1.72265625, + "reward_std": 0.7322920560836792, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3683478757739067, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.3322341814637184, + "step": 1330, + "token_counts/after_target": 453.5, + "token_counts/after_think": 103.0, + "token_counts/before_target": 1658.25, + "token_counts/before_think": 863.25 + }, + { + "avg_penalty/after_target": 3.101621925830841, + "avg_penalty/after_think": 2.5957183241844177, + "avg_penalty/before_target": 0.367489717900753, + "avg_penalty/before_think": 0.48620984703302383, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.75, + "completions/max_terminated_length": 465.75, + "completions/mean_length": 178.515625, + "completions/mean_terminated_length": 178.515625, + "completions/min_length": 40.75, + "completions/min_terminated_length": 40.75, + "epoch": 0.6655, + "grad_norm": 13.841852188110352, + "kl": 12.8984375, + "learning_rate": 6.092688715107265e-06, + "loss": 1.5245, + "num_tokens": 40075101.0, + "reward": 1.640625, + "reward_std": 0.7334598153829575, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.3141296301037073, + "step": 1331, + "token_counts/after_target": 722.0, + "token_counts/after_think": 41.0, 
+ "token_counts/before_target": 1462.25, + "token_counts/before_think": 631.0 + }, + { + "avg_penalty/after_target": 2.6929361820220947, + "avg_penalty/after_think": 3.8043598532676697, + "avg_penalty/before_target": 0.33413439616560936, + "avg_penalty/before_think": 0.6532198041677475, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.5, + "completions/max_terminated_length": 649.5, + "completions/mean_length": 205.5, + "completions/mean_terminated_length": 205.5, + "completions/min_length": 41.75, + "completions/min_terminated_length": 41.75, + "epoch": 0.666, + "grad_norm": 3.960552930831909, + "kl": 12.8046875, + "learning_rate": 6.076628833964389e-06, + "loss": 1.2894, + "num_tokens": 40097821.0, + "reward": 1.71484375, + "reward_std": 0.4725046455860138, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.25409944355487823, + "rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.22319824248552322, + "step": 1332, + "token_counts/after_target": 567.5, + "token_counts/after_think": 27.5, + "token_counts/before_target": 1797.0, + "token_counts/before_think": 896.0 + }, + { + "avg_penalty/after_target": 2.7472344040870667, + "avg_penalty/after_think": 3.7953431010246277, + "avg_penalty/before_target": 0.3940073624253273, + "avg_penalty/before_think": 0.4345746859908104, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 148.0, + "completions/mean_terminated_length": 148.0, + "completions/min_length": 47.25, + 
"completions/min_terminated_length": 47.25, + "epoch": 0.6665, + "grad_norm": 10.745802879333496, + "kl": 10.5625, + "learning_rate": 6.06058090409049e-06, + "loss": 1.2358, + "num_tokens": 40116253.0, + "reward": 1.7265625, + "reward_std": 0.6729106307029724, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3758598491549492, + "rewards/tag_count_reward/mean": 0.8828125, + "rewards/tag_count_reward/std": 0.30499594658613205, + "step": 1333, + "token_counts/after_target": 370.75, + "token_counts/after_think": 56.5, + "token_counts/before_target": 1298.5, + "token_counts/before_think": 642.25 + }, + { + "avg_penalty/after_target": 2.7293860912323, + "avg_penalty/after_think": 3.5514878630638123, + "avg_penalty/before_target": 0.3975551463663578, + "avg_penalty/before_think": 0.5931065231561661, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 190.625, + "completions/mean_terminated_length": 190.625, + "completions/min_length": 44.25, + "completions/min_terminated_length": 44.25, + "epoch": 0.667, + "grad_norm": 3.772963523864746, + "kl": 20.96875, + "learning_rate": 6.044544974370352e-06, + "loss": 1.7732, + "num_tokens": 40139797.0, + "reward": 1.50390625, + "reward_std": 0.8583057671785355, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4519384130835533, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.41579563170671463, + "step": 1334, + "token_counts/after_target": 547.75, + "token_counts/after_think": 54.75, + "token_counts/before_target": 1851.25, + 
"token_counts/before_think": 596.25 + }, + { + "avg_penalty/after_target": 2.7197549045085907, + "avg_penalty/after_think": 2.9101579785346985, + "avg_penalty/before_target": 0.3177121728658676, + "avg_penalty/before_think": 0.4353831820189953, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.75, + "completions/max_terminated_length": 463.75, + "completions/mean_length": 172.703125, + "completions/mean_terminated_length": 172.703125, + "completions/min_length": 39.25, + "completions/min_terminated_length": 39.25, + "epoch": 0.6675, + "grad_norm": 3.695735216140747, + "kl": 13.7890625, + "learning_rate": 6.028521093652195e-06, + "loss": 1.2175, + "num_tokens": 40161906.0, + "reward": 1.58984375, + "reward_std": 0.6573606729507446, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.3454566150903702, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.3188259154558182, + "step": 1335, + "token_counts/after_target": 406.0, + "token_counts/after_think": 20.5, + "token_counts/before_target": 1400.5, + "token_counts/before_think": 936.25 + }, + { + "avg_penalty/after_target": 2.055222302675247, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.41310184821486473, + "avg_penalty/before_think": 0.43423741310834885, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.75, + "completions/max_terminated_length": 702.75, + "completions/mean_length": 222.40625, + "completions/mean_terminated_length": 222.40625, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 
0.668, + "grad_norm": 3.5412323474884033, + "kl": 18.71875, + "learning_rate": 6.0125093107475385e-06, + "loss": 1.4968, + "num_tokens": 40184444.0, + "reward": 1.51171875, + "reward_std": 0.7949625551700592, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4519384130835533, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.3591733481734991, + "step": 1336, + "token_counts/after_target": 598.75, + "token_counts/after_think": 35.5, + "token_counts/before_target": 1970.0, + "token_counts/before_think": 954.25 + }, + { + "avg_penalty/after_target": 2.329101175069809, + "avg_penalty/after_think": 2.242038309574127, + "avg_penalty/before_target": 0.3904246687889099, + "avg_penalty/before_think": 0.5254567787051201, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.0, + "completions/max_terminated_length": 659.0, + "completions/mean_length": 230.921875, + "completions/mean_terminated_length": 230.921875, + "completions/min_length": 57.75, + "completions/min_terminated_length": 57.75, + "epoch": 0.6685, + "grad_norm": 3.0750231742858887, + "kl": 24.03125, + "learning_rate": 5.996509674431053e-06, + "loss": 2.0529, + "num_tokens": 40211079.0, + "reward": 1.46875, + "reward_std": 0.8161628395318985, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.45028156042099, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.3787120580673218, + "step": 1337, + "token_counts/after_target": 803.5, + "token_counts/after_think": 91.75, + "token_counts/before_target": 1914.5, + "token_counts/before_think": 885.0 + }, + { + "avg_penalty/after_target": 
2.349090188741684, + "avg_penalty/after_think": 2.8811275959014893, + "avg_penalty/before_target": 0.4808518961071968, + "avg_penalty/before_think": 0.4858979657292366, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 663.5, + "completions/max_terminated_length": 610.5, + "completions/mean_length": 228.171875, + "completions/mean_terminated_length": 216.29896545410156, + "completions/min_length": 41.5, + "completions/min_terminated_length": 41.5, + "epoch": 0.669, + "grad_norm": 3.993795156478882, + "kl": 20.828125, + "learning_rate": 5.9805222334404e-06, + "loss": 1.6948, + "num_tokens": 40237938.0, + "reward": 1.48828125, + "reward_std": 0.8370039612054825, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45247192680835724, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.4001483768224716, + "step": 1338, + "token_counts/after_target": 781.5, + "token_counts/after_think": 50.75, + "token_counts/before_target": 2015.5, + "token_counts/before_think": 803.0 + }, + { + "avg_penalty/after_target": 2.4243495762348175, + "avg_penalty/after_think": 2.707191526889801, + "avg_penalty/before_target": 0.3984391316771507, + "avg_penalty/before_think": 0.5121782273054123, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 237.671875, + "completions/mean_terminated_length": 237.671875, + "completions/min_length": 37.25, + "completions/min_terminated_length": 37.25, + "epoch": 0.6695, + "grad_norm": 10.419482231140137, + "kl": 
21.34375, + "learning_rate": 5.9645470364761e-06, + "loss": 1.503, + "num_tokens": 40268765.0, + "reward": 1.32421875, + "reward_std": 0.8680827915668488, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4696519449353218, + "rewards/tag_count_reward/mean": 0.68359375, + "rewards/tag_count_reward/std": 0.41009513288736343, + "step": 1339, + "token_counts/after_target": 740.25, + "token_counts/after_think": 45.5, + "token_counts/before_target": 2239.0, + "token_counts/before_think": 778.0 + }, + { + "avg_penalty/after_target": 1.7387487292289734, + "avg_penalty/after_think": 3.7429412603378296, + "avg_penalty/before_target": 0.484852347522974, + "avg_penalty/before_think": 0.632885679602623, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 791.0, + "completions/max_terminated_length": 680.75, + "completions/mean_length": 292.3125, + "completions/mean_terminated_length": 280.6354217529297, + "completions/min_length": 48.5, + "completions/min_terminated_length": 48.5, + "epoch": 0.67, + "grad_norm": 4.296994686126709, + "kl": 21.859375, + "learning_rate": 5.948584132201376e-06, + "loss": 1.8154, + "num_tokens": 40300081.0, + "reward": 1.4296875, + "reward_std": 0.8043037056922913, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.45916909724473953, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.37163787707686424, + "step": 1340, + "token_counts/after_target": 1073.5, + "token_counts/after_think": 134.25, + "token_counts/before_target": 2553.0, + "token_counts/before_think": 916.25 + }, + { + "avg_penalty/after_target": 2.371756434440613, + 
"avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.34398985654115677, + "avg_penalty/before_think": 0.5216462686657906, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 571.25, + "completions/max_terminated_length": 481.75, + "completions/mean_length": 204.546875, + "completions/mean_terminated_length": 192.48542022705078, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.6705, + "grad_norm": 2.5828750133514404, + "kl": 16.921875, + "learning_rate": 5.932633569242e-06, + "loss": 1.4283, + "num_tokens": 40322324.0, + "reward": 1.52734375, + "reward_std": 0.7975708693265915, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4132782220840454, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.3931238502264023, + "step": 1341, + "token_counts/after_target": 502.25, + "token_counts/after_think": 57.5, + "token_counts/before_target": 1865.5, + "token_counts/before_think": 847.5 + }, + { + "avg_penalty/after_target": 3.19840544462204, + "avg_penalty/after_think": 2.3533346354961395, + "avg_penalty/before_target": 0.37869948893785477, + "avg_penalty/before_think": 0.38559599593281746, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.5, + "completions/max_terminated_length": 549.5, + "completions/mean_length": 241.421875, + "completions/mean_terminated_length": 241.421875, + "completions/min_length": 63.75, + "completions/min_terminated_length": 63.75, + "epoch": 0.671, + "grad_norm": 3.6294686794281006, + "kl": 20.90625, + "learning_rate": 
5.9166953961861536e-06, + "loss": 1.7915, + "num_tokens": 40347119.0, + "reward": 1.38671875, + "reward_std": 0.8586315512657166, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.4546433389186859, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.41998203843832016, + "step": 1342, + "token_counts/after_target": 932.5, + "token_counts/after_think": 10.75, + "token_counts/before_target": 2131.5, + "token_counts/before_think": 788.0 + }, + { + "avg_penalty/after_target": 2.7882482409477234, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3111506775021553, + "avg_penalty/before_think": 0.5183211416006088, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.25, + "completions/max_terminated_length": 569.25, + "completions/mean_length": 194.53125, + "completions/mean_terminated_length": 194.53125, + "completions/min_length": 42.5, + "completions/min_terminated_length": 42.5, + "epoch": 0.6715, + "grad_norm": 3.939988136291504, + "kl": 16.171875, + "learning_rate": 5.900769661584273e-06, + "loss": 1.5105, + "num_tokens": 40369073.0, + "reward": 1.51953125, + "reward_std": 0.7564997971057892, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.41737766563892365, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.3735746368765831, + "step": 1343, + "token_counts/after_target": 588.0, + "token_counts/after_think": 42.75, + "token_counts/before_target": 1782.25, + "token_counts/before_think": 699.5 + }, + { + "avg_penalty/after_target": 2.272534966468811, + "avg_penalty/after_think": 3.3533345758914948, + 
"avg_penalty/before_target": 0.3864738382399082, + "avg_penalty/before_think": 0.4247545301914215, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 745.75, + "completions/max_terminated_length": 591.5, + "completions/mean_length": 202.9375, + "completions/mean_terminated_length": 189.7052116394043, + "completions/min_length": 46.5, + "completions/min_terminated_length": 46.5, + "epoch": 0.672, + "grad_norm": 2.9247515201568604, + "kl": 23.6875, + "learning_rate": 5.884856413948913e-06, + "loss": 1.9394, + "num_tokens": 40392445.0, + "reward": 1.44921875, + "reward_std": 0.8729864954948425, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4625816270709038, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.41983022540807724, + "step": 1344, + "token_counts/after_target": 414.25, + "token_counts/after_think": 17.75, + "token_counts/before_target": 2075.5, + "token_counts/before_think": 739.5 + }, + { + "avg_penalty/after_target": 2.901423990726471, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.33868322148919106, + "avg_penalty/before_think": 0.36269616335630417, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 606.75, + "completions/max_terminated_length": 545.25, + "completions/mean_length": 202.34375, + "completions/mean_terminated_length": 190.64791870117188, + "completions/min_length": 43.75, + "completions/min_terminated_length": 43.75, + "epoch": 0.6725, + "grad_norm": 10.368449211120605, + "kl": 11.63671875, + "learning_rate": 5.868955701754584e-06, + "loss": 1.3728, 
+ "num_tokens": 40418403.0, + "reward": 1.78515625, + "reward_std": 0.5023853927850723, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.28694770485162735, + "rewards/tag_count_reward/mean": 0.91015625, + "rewards/tag_count_reward/std": 0.23217133432626724, + "step": 1345, + "token_counts/after_target": 480.0, + "token_counts/after_think": 132.0, + "token_counts/before_target": 1790.0, + "token_counts/before_think": 835.5 + }, + { + "avg_penalty/after_target": 2.3145124316215515, + "avg_penalty/after_think": 3.741288125514984, + "avg_penalty/before_target": 0.3116474375128746, + "avg_penalty/before_think": 0.6311537548899651, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.25, + "completions/max_terminated_length": 664.25, + "completions/mean_length": 184.125, + "completions/mean_terminated_length": 184.125, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.673, + "grad_norm": 6.891303539276123, + "kl": 15.53125, + "learning_rate": 5.853067573437612e-06, + "loss": 1.5567, + "num_tokens": 40440603.0, + "reward": 1.61328125, + "reward_std": 0.7678661793470383, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.42516325414180756, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.35904594510793686, + "step": 1346, + "token_counts/after_target": 410.25, + "token_counts/after_think": 35.5, + "token_counts/before_target": 1815.25, + "token_counts/before_think": 685.0 + }, + { + "avg_penalty/after_target": 2.6585782170295715, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.47187984734773636, + 
"avg_penalty/before_think": 0.5223656445741653, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 277.5625, + "completions/mean_terminated_length": 277.5625, + "completions/min_length": 49.75, + "completions/min_terminated_length": 49.75, + "epoch": 0.6735, + "grad_norm": 6.496009349822998, + "kl": 17.203125, + "learning_rate": 5.83719207739599e-06, + "loss": 1.6677, + "num_tokens": 40471119.0, + "reward": 1.5625, + "reward_std": 0.7989805787801743, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4255262687802315, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.3842107132077217, + "step": 1347, + "token_counts/after_target": 921.0, + "token_counts/after_think": 49.0, + "token_counts/before_target": 2103.0, + "token_counts/before_think": 1368.0 + }, + { + "avg_penalty/after_target": 2.663465529680252, + "avg_penalty/after_think": 3.580567240715027, + "avg_penalty/before_target": 0.3455043435096741, + "avg_penalty/before_think": 0.46367668360471725, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.5, + "completions/max_terminated_length": 588.5, + "completions/mean_length": 231.71875, + "completions/mean_terminated_length": 231.71875, + "completions/min_length": 49.25, + "completions/min_terminated_length": 49.25, + "epoch": 0.674, + "grad_norm": 11.967328071594238, + "kl": 12.6640625, + "learning_rate": 5.82132926198923e-06, + "loss": 1.4902, + "num_tokens": 40494973.0, + "reward": 1.69140625, + "reward_std": 
0.7207631170749664, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.3723389655351639, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.35314271599054337, + "step": 1348, + "token_counts/after_target": 683.25, + "token_counts/after_think": 87.75, + "token_counts/before_target": 1841.5, + "token_counts/before_think": 1095.0 + }, + { + "avg_penalty/after_target": 2.4891494512557983, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.395147442817688, + "avg_penalty/before_think": 0.3270818889141083, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.25, + "completions/max_terminated_length": 539.25, + "completions/mean_length": 166.8125, + "completions/mean_terminated_length": 166.8125, + "completions/min_length": 44.5, + "completions/min_terminated_length": 44.5, + "epoch": 0.6745, + "grad_norm": 4.777812480926514, + "kl": 21.2265625, + "learning_rate": 5.8054791755382286e-06, + "loss": 1.9828, + "num_tokens": 40515489.0, + "reward": 1.63671875, + "reward_std": 0.6440555155277252, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.33226002007722855, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.31442970782518387, + "step": 1349, + "token_counts/after_target": 465.0, + "token_counts/after_think": 67.5, + "token_counts/before_target": 1479.5, + "token_counts/before_think": 657.0 + }, + { + "avg_penalty/after_target": 2.255237877368927, + "avg_penalty/after_think": 3.753013491630554, + "avg_penalty/before_target": 0.5051050335168839, + "avg_penalty/before_think": 0.7143343389034271, + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.5, + "completions/max_terminated_length": 668.5, + "completions/mean_length": 218.9375, + "completions/mean_terminated_length": 218.9375, + "completions/min_length": 38.75, + "completions/min_terminated_length": 38.75, + "epoch": 0.675, + "grad_norm": 6.189041614532471, + "kl": 17.78125, + "learning_rate": 5.789641866325091e-06, + "loss": 1.7802, + "num_tokens": 40543517.0, + "reward": 1.53515625, + "reward_std": 0.778958261013031, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44187305867671967, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.3628995306789875, + "step": 1350, + "token_counts/after_target": 811.0, + "token_counts/after_think": 140.0, + "token_counts/before_target": 1801.75, + "token_counts/before_think": 750.25 + }, + { + "avg_penalty/after_target": 2.7367680370807648, + "avg_penalty/after_think": 2.9949596524238586, + "avg_penalty/before_target": 0.3092820607125759, + "avg_penalty/before_think": 0.43967337906360626, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.5, + "completions/max_terminated_length": 388.5, + "completions/mean_length": 150.296875, + "completions/mean_terminated_length": 150.296875, + "completions/min_length": 34.75, + "completions/min_terminated_length": 34.75, + "epoch": 0.6755, + "grad_norm": 3.777235507965088, + "kl": 14.9140625, + "learning_rate": 5.773817382593008e-06, + "loss": 1.3802, + "num_tokens": 40563344.0, + "reward": 1.703125, + "reward_std": 0.5571124106645584, + "rewards/accuracy_reward/mean": NaN, + 
"rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.2882782220840454, + "rewards/tag_count_reward/mean": 0.859375, + "rewards/tag_count_reward/std": 0.2737811356782913, + "step": 1351, + "token_counts/after_target": 260.5, + "token_counts/after_think": 74.75, + "token_counts/before_target": 1273.75, + "token_counts/before_think": 795.75 + }, + { + "avg_penalty/after_target": 2.4282293617725372, + "avg_penalty/after_think": 3.496210277080536, + "avg_penalty/before_target": 0.27927348017692566, + "avg_penalty/before_think": 0.37509769946336746, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 143.015625, + "completions/mean_terminated_length": 143.015625, + "completions/min_length": 37.5, + "completions/min_terminated_length": 37.5, + "epoch": 0.676, + "grad_norm": 4.407766819000244, + "kl": 14.6875, + "learning_rate": 5.758005772546097e-06, + "loss": 1.4049, + "num_tokens": 40581969.0, + "reward": 1.7265625, + "reward_std": 0.6733666509389877, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3450859263539314, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.32921820878982544, + "step": 1352, + "token_counts/after_target": 212.0, + "token_counts/after_think": 54.25, + "token_counts/before_target": 1177.25, + "token_counts/before_think": 844.75 + }, + { + "avg_penalty/after_target": 2.3473637998104095, + "avg_penalty/after_think": 2.8606510162353516, + "avg_penalty/before_target": 0.4253913015127182, + "avg_penalty/before_think": 0.449380025267601, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 
0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.0, + "completions/max_terminated_length": 619.0, + "completions/mean_length": 188.484375, + "completions/mean_terminated_length": 188.484375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.6765, + "grad_norm": 4.311939239501953, + "kl": 26.4375, + "learning_rate": 5.742207084349274e-06, + "loss": 2.1898, + "num_tokens": 40602912.0, + "reward": 1.51953125, + "reward_std": 0.8362147957086563, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.45508860796689987, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.39622336626052856, + "step": 1353, + "token_counts/after_target": 621.0, + "token_counts/after_think": 31.25, + "token_counts/before_target": 1674.0, + "token_counts/before_think": 689.5 + }, + { + "avg_penalty/after_target": 2.4719059765338898, + "avg_penalty/after_think": 2.801143169403076, + "avg_penalty/before_target": 0.4738684445619583, + "avg_penalty/before_think": 0.40962156653404236, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 718.5, + "completions/max_terminated_length": 606.75, + "completions/mean_length": 234.171875, + "completions/mean_terminated_length": 221.6500015258789, + "completions/min_length": 46.75, + "completions/min_terminated_length": 46.75, + "epoch": 0.677, + "grad_norm": 10.299633979797363, + "kl": 25.875, + "learning_rate": 5.726421366128076e-06, + "loss": 1.9464, + "num_tokens": 40629499.0, + "reward": 1.5, + "reward_std": 0.8697581589221954, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + 
"rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.4094010069966316, + "step": 1354, + "token_counts/after_target": 809.25, + "token_counts/after_think": 20.25, + "token_counts/before_target": 2188.0, + "token_counts/before_think": 729.25 + }, + { + "avg_penalty/after_target": 2.2980062067508698, + "avg_penalty/after_think": 2.3614553809165955, + "avg_penalty/before_target": 0.445321723818779, + "avg_penalty/before_think": 0.49299565702676773, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 675.75, + "completions/max_terminated_length": 528.25, + "completions/mean_length": 220.84375, + "completions/mean_terminated_length": 207.59479522705078, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.6775, + "grad_norm": 7.670156955718994, + "kl": 27.921875, + "learning_rate": 5.710648665968543e-06, + "loss": 2.181, + "num_tokens": 40654337.0, + "reward": 1.52734375, + "reward_std": 0.824169784784317, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.43708496540784836, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.3807876333594322, + "step": 1355, + "token_counts/after_target": 738.5, + "token_counts/after_think": 88.5, + "token_counts/before_target": 1858.25, + "token_counts/before_think": 848.25 + }, + { + "avg_penalty/after_target": 2.6355346143245697, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.39651478454470634, + "avg_penalty/before_think": 0.3754347190260887, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 176.84375, + "completions/mean_terminated_length": 176.84375, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.678, + "grad_norm": 4.366046905517578, + "kl": 24.71875, + "learning_rate": 5.694889031917047e-06, + "loss": 2.1704, + "num_tokens": 40678903.0, + "reward": 1.5390625, + "reward_std": 0.8452775180339813, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42867646366357803, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.4175449535250664, + "step": 1356, + "token_counts/after_target": 618.75, + "token_counts/after_think": 78.0, + "token_counts/before_target": 1535.75, + "token_counts/before_think": 597.0 + }, + { + "avg_penalty/after_target": 2.6133302450180054, + "avg_penalty/after_think": 3.1064553260803223, + "avg_penalty/before_target": 0.5413467176258564, + "avg_penalty/before_think": 0.41270630061626434, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 604.75, + "completions/max_terminated_length": 458.5, + "completions/mean_length": 150.734375, + "completions/mean_terminated_length": 136.64166831970215, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.6785, + "grad_norm": 7.4177021980285645, + "kl": 22.734375, + "learning_rate": 5.679142511980176e-06, + "loss": 2.1261, + "num_tokens": 40700438.0, + "reward": 1.7265625, + "reward_std": 0.761127308011055, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + 
"rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38336414843797684, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.33862946927547455, + "step": 1357, + "token_counts/after_target": 360.75, + "token_counts/after_think": 36.25, + "token_counts/before_target": 1308.5, + "token_counts/before_think": 706.25 + }, + { + "avg_penalty/after_target": 2.012408971786499, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3939330726861954, + "avg_penalty/before_think": 0.3897639065980911, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 693.25, + "completions/max_terminated_length": 693.25, + "completions/mean_length": 198.53125, + "completions/mean_terminated_length": 198.53125, + "completions/min_length": 30.5, + "completions/min_terminated_length": 30.5, + "epoch": 0.679, + "grad_norm": 13.682015419006348, + "kl": 30.25, + "learning_rate": 5.663409154124557e-06, + "loss": 2.1865, + "num_tokens": 40723736.0, + "reward": 1.4296875, + "reward_std": 0.8565651327371597, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.46034691482782364, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.41791941225528717, + "step": 1358, + "token_counts/after_target": 624.5, + "token_counts/after_think": 31.0, + "token_counts/before_target": 1848.75, + "token_counts/before_think": 672.25 + }, + { + "avg_penalty/after_target": 3.2108626067638397, + "avg_penalty/after_think": 1.8742886781692505, + "avg_penalty/before_target": 0.30096760019659996, + "avg_penalty/before_think": 0.42041002586483955, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 638.0, + "completions/max_terminated_length": 460.75, + "completions/mean_length": 160.734375, + "completions/mean_terminated_length": 146.43854331970215, + "completions/min_length": 31.75, + "completions/min_terminated_length": 31.75, + "epoch": 0.6795, + "grad_norm": 6.657754898071289, + "kl": 29.125, + "learning_rate": 5.647689006276727e-06, + "loss": 2.4258, + "num_tokens": 40746375.0, + "reward": 1.47265625, + "reward_std": 0.850959450006485, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4515564441680908, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.4078143760561943, + "step": 1359, + "token_counts/after_target": 534.5, + "token_counts/after_think": 31.0, + "token_counts/before_target": 1310.0, + "token_counts/before_think": 696.25 + }, + { + "avg_penalty/after_target": 2.814944803714752, + "avg_penalty/after_think": 2.7754199504852295, + "avg_penalty/before_target": 0.3358500078320503, + "avg_penalty/before_think": 0.42413175851106644, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.25, + "completions/max_terminated_length": 822.25, + "completions/mean_length": 215.140625, + "completions/mean_terminated_length": 215.140625, + "completions/min_length": 37.25, + "completions/min_terminated_length": 37.25, + "epoch": 0.68, + "grad_norm": 6.4755706787109375, + "kl": 22.90625, + "learning_rate": 5.631982116322981e-06, + "loss": 1.7971, + "num_tokens": 40775808.0, + "reward": 1.5390625, + "reward_std": 0.8092748671770096, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + 
"rewards/format_reward/std": 0.43655145168304443, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.3886198028922081, + "step": 1360, + "token_counts/after_target": 564.25, + "token_counts/after_think": 19.5, + "token_counts/before_target": 1965.75, + "token_counts/before_think": 892.75 + }, + { + "avg_penalty/after_target": 2.4752378165721893, + "avg_penalty/after_think": 3.708096504211426, + "avg_penalty/before_target": 0.2681034803390503, + "avg_penalty/before_think": 0.4204631373286247, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 156.890625, + "completions/mean_terminated_length": 156.890625, + "completions/min_length": 41.75, + "completions/min_terminated_length": 41.75, + "epoch": 0.6805, + "grad_norm": 8.768710136413574, + "kl": 16.96875, + "learning_rate": 5.616288532109225e-06, + "loss": 1.2332, + "num_tokens": 40796105.0, + "reward": 1.609375, + "reward_std": 0.7955547422170639, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.3906753733754158, + "step": 1361, + "token_counts/after_target": 283.25, + "token_counts/after_think": 57.0, + "token_counts/before_target": 1291.0, + "token_counts/before_think": 879.0 + }, + { + "avg_penalty/after_target": 3.248130738735199, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.43070439249277115, + "avg_penalty/before_think": 0.3890530802309513, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.015625, + "completions/max_length": 595.25, + "completions/max_terminated_length": 497.75, + "completions/mean_length": 180.59375, + "completions/mean_terminated_length": 168.34167098999023, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.681, + "grad_norm": 5.289531230926514, + "kl": 28.5, + "learning_rate": 5.600608301440848e-06, + "loss": 2.2836, + "num_tokens": 40817839.0, + "reward": 1.546875, + "reward_std": 0.8884435892105103, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4462348371744156, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.3966154381632805, + "step": 1362, + "token_counts/after_target": 713.25, + "token_counts/after_think": 56.5, + "token_counts/before_target": 1456.5, + "token_counts/before_think": 663.25 + }, + { + "avg_penalty/after_target": 2.7847838401794434, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.2954787015914917, + "avg_penalty/before_think": 0.5686619952321053, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.5, + "completions/max_terminated_length": 535.5, + "completions/mean_length": 180.0625, + "completions/mean_terminated_length": 180.0625, + "completions/min_length": 41.5, + "completions/min_terminated_length": 41.5, + "epoch": 0.6815, + "grad_norm": 10.903360366821289, + "kl": 26.40625, + "learning_rate": 5.584941472082549e-06, + "loss": 1.9041, + "num_tokens": 40840643.0, + "reward": 1.390625, + "reward_std": 0.8968610018491745, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.4757782220840454, + 
"rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.44028453528881073, + "step": 1363, + "token_counts/after_target": 484.25, + "token_counts/after_think": 34.75, + "token_counts/before_target": 1668.75, + "token_counts/before_think": 693.25 + }, + { + "avg_penalty/after_target": 2.59529846906662, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.30697184428572655, + "avg_penalty/before_think": 0.37766433507204056, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 123.34375, + "completions/mean_terminated_length": 123.34375, + "completions/min_length": 42.25, + "completions/min_terminated_length": 42.25, + "epoch": 0.682, + "grad_norm": 12.596790313720703, + "kl": 13.0859375, + "learning_rate": 5.569288091758205e-06, + "loss": 1.5232, + "num_tokens": 40858345.0, + "reward": 1.7734375, + "reward_std": 0.5891415029764175, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.31116948276758194, + "rewards/tag_count_reward/mean": 0.8828125, + "rewards/tag_count_reward/std": 0.28372376412153244, + "step": 1364, + "token_counts/after_target": 296.0, + "token_counts/after_think": 29.0, + "token_counts/before_target": 1038.75, + "token_counts/before_think": 609.75 + }, + { + "avg_penalty/after_target": 3.1286563873291016, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4284762851893902, + "avg_penalty/before_think": 0.35716672986745834, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.25, + 
"completions/max_terminated_length": 601.25, + "completions/mean_length": 174.90625, + "completions/mean_terminated_length": 174.90625, + "completions/min_length": 36.5, + "completions/min_terminated_length": 36.5, + "epoch": 0.6825, + "grad_norm": 7.70693302154541, + "kl": 21.125, + "learning_rate": 5.553648208150728e-06, + "loss": 1.9556, + "num_tokens": 40878179.0, + "reward": 1.49609375, + "reward_std": 0.7781267464160919, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.42739029973745346, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.36656907200813293, + "step": 1365, + "token_counts/after_target": 646.75, + "token_counts/after_think": 28.5, + "token_counts/before_target": 1502.75, + "token_counts/before_think": 620.5 + }, + { + "avg_penalty/after_target": 2.3992600440979004, + "avg_penalty/after_think": 2.8965219259262085, + "avg_penalty/before_target": 0.29059863090515137, + "avg_penalty/before_think": 0.42258331179618835, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.75, + "completions/max_terminated_length": 606.75, + "completions/mean_length": 174.53125, + "completions/mean_terminated_length": 174.53125, + "completions/min_length": 37.75, + "completions/min_terminated_length": 37.75, + "epoch": 0.683, + "grad_norm": 6.975317478179932, + "kl": 24.40625, + "learning_rate": 5.5380218689019125e-06, + "loss": 1.8845, + "num_tokens": 40900549.0, + "reward": 1.48828125, + "reward_std": 0.8814304172992706, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.45508860796689987, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 
0.432202510535717, + "step": 1366, + "token_counts/after_target": 423.0, + "token_counts/after_think": 31.0, + "token_counts/before_target": 1696.5, + "token_counts/before_think": 642.0 + }, + { + "avg_penalty/after_target": 2.570481389760971, + "avg_penalty/after_think": 2.89497172832489, + "avg_penalty/before_target": 0.25286876782774925, + "avg_penalty/before_think": 0.4557325914502144, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.75, + "completions/max_terminated_length": 364.75, + "completions/mean_length": 153.5625, + "completions/mean_terminated_length": 153.5625, + "completions/min_length": 54.75, + "completions/min_terminated_length": 54.75, + "epoch": 0.6835, + "grad_norm": 6.21677827835083, + "kl": 13.73046875, + "learning_rate": 5.522409121612304e-06, + "loss": 1.3874, + "num_tokens": 40919993.0, + "reward": 1.6875, + "reward_std": 0.6396308541297913, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.35648179799318314, + "rewards/tag_count_reward/mean": 0.859375, + "rewards/tag_count_reward/std": 0.2903648614883423, + "step": 1367, + "token_counts/after_target": 309.0, + "token_counts/after_think": 21.0, + "token_counts/before_target": 1186.0, + "token_counts/before_think": 941.0 + }, + { + "avg_penalty/after_target": 2.502702057361603, + "avg_penalty/after_think": 2.535111665725708, + "avg_penalty/before_target": 0.53255545347929, + "avg_penalty/before_think": 0.4549228586256504, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.5, + "completions/max_terminated_length": 683.5, + "completions/mean_length": 
264.53125, + "completions/mean_terminated_length": 264.53125, + "completions/min_length": 67.75, + "completions/min_terminated_length": 67.75, + "epoch": 0.684, + "grad_norm": 8.458571434020996, + "kl": 22.109375, + "learning_rate": 5.506810013841036e-06, + "loss": 2.1179, + "num_tokens": 40944955.0, + "reward": 1.5546875, + "reward_std": 0.8171216249465942, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4229728877544403, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.40087413042783737, + "step": 1368, + "token_counts/after_target": 1229.75, + "token_counts/after_think": 102.25, + "token_counts/before_target": 1959.25, + "token_counts/before_think": 941.25 + }, + { + "avg_penalty/after_target": 2.3212905824184418, + "avg_penalty/after_think": 1.904992401599884, + "avg_penalty/before_target": 0.3554445654153824, + "avg_penalty/before_think": 0.3877792991697788, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.5, + "completions/max_terminated_length": 493.5, + "completions/mean_length": 181.375, + "completions/mean_terminated_length": 181.375, + "completions/min_length": 43.5, + "completions/min_terminated_length": 43.5, + "epoch": 0.6845, + "grad_norm": 5.167878150939941, + "kl": 22.453125, + "learning_rate": 5.491224593105695e-06, + "loss": 1.9717, + "num_tokens": 40968275.0, + "reward": 1.62109375, + "reward_std": 0.7284673601388931, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.39656074345111847, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.34825993329286575, + "step": 1369, + "token_counts/after_target": 547.5, + 
"token_counts/after_think": 7.75, + "token_counts/before_target": 1653.0, + "token_counts/before_think": 693.75 + }, + { + "avg_penalty/after_target": 1.7734675109386444, + "avg_penalty/after_think": 3.288873612880707, + "avg_penalty/before_target": 0.3951355069875717, + "avg_penalty/before_think": 0.5295819863677025, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.5, + "completions/max_terminated_length": 548.5, + "completions/mean_length": 167.25, + "completions/mean_terminated_length": 167.25, + "completions/min_length": 33.5, + "completions/min_terminated_length": 33.5, + "epoch": 0.685, + "grad_norm": 4.026068687438965, + "kl": 18.453125, + "learning_rate": 5.475652906882173e-06, + "loss": 1.6343, + "num_tokens": 40988995.0, + "reward": 1.625, + "reward_std": 0.6989090144634247, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.3890564441680908, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.3145184591412544, + "step": 1370, + "token_counts/after_target": 340.25, + "token_counts/after_think": 39.75, + "token_counts/before_target": 1458.75, + "token_counts/before_think": 837.25 + }, + { + "avg_penalty/after_target": 2.67399862408638, + "avg_penalty/after_think": 2.468068540096283, + "avg_penalty/before_target": 0.3111100345849991, + "avg_penalty/before_think": 0.323847733438015, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.0, + "completions/max_terminated_length": 527.0, + "completions/mean_length": 172.625, + "completions/mean_terminated_length": 172.625, + "completions/min_length": 
39.25, + "completions/min_terminated_length": 39.25, + "epoch": 0.6855, + "grad_norm": 3.0338497161865234, + "kl": 20.515625, + "learning_rate": 5.460095002604533e-06, + "loss": 1.7937, + "num_tokens": 41011595.0, + "reward": 1.6171875, + "reward_std": 0.788042888045311, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4141380712389946, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.37942515313625336, + "step": 1371, + "token_counts/after_target": 390.0, + "token_counts/after_think": 27.25, + "token_counts/before_target": 1746.25, + "token_counts/before_think": 598.5 + }, + { + "avg_penalty/after_target": 2.039978474378586, + "avg_penalty/after_think": 2.427881598472595, + "avg_penalty/before_target": 0.45613787323236465, + "avg_penalty/before_think": 0.4451811760663986, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.25, + "completions/max_terminated_length": 600.25, + "completions/mean_length": 173.875, + "completions/mean_terminated_length": 173.875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.686, + "grad_norm": 4.368394374847412, + "kl": 20.8515625, + "learning_rate": 5.444550927664847e-06, + "loss": 1.8983, + "num_tokens": 41033907.0, + "reward": 1.65625, + "reward_std": 0.6093225926160812, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.32474804669618607, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.2946983128786087, + "step": 1372, + "token_counts/after_target": 581.5, + "token_counts/after_think": 25.0, + "token_counts/before_target": 1481.75, + 
"token_counts/before_think": 693.75 + }, + { + "avg_penalty/after_target": 2.74391907453537, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.36082156002521515, + "avg_penalty/before_think": 0.5567894354462624, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.75, + "completions/max_terminated_length": 581.75, + "completions/mean_length": 229.1875, + "completions/mean_terminated_length": 229.1875, + "completions/min_length": 50.25, + "completions/min_terminated_length": 50.25, + "epoch": 0.6865, + "grad_norm": 2.4593260288238525, + "kl": 13.8125, + "learning_rate": 5.429020729413062e-06, + "loss": 1.2236, + "num_tokens": 41067279.0, + "reward": 1.55078125, + "reward_std": 0.7972749918699265, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4229728877544403, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.39921239763498306, + "step": 1373, + "token_counts/after_target": 910.25, + "token_counts/after_think": 82.5, + "token_counts/before_target": 1985.25, + "token_counts/before_think": 689.0 + }, + { + "avg_penalty/after_target": 2.607286125421524, + "avg_penalty/after_think": 1.7800734639167786, + "avg_penalty/before_target": 0.26681042835116386, + "avg_penalty/before_think": 0.5030723288655281, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 146.65625, + "completions/mean_terminated_length": 146.65625, + "completions/min_length": 34.75, + "completions/min_terminated_length": 34.75, + "epoch": 0.687, 
+ "grad_norm": 5.498472690582275, + "kl": 17.21875, + "learning_rate": 5.413504455156855e-06, + "loss": 1.3666, + "num_tokens": 41086217.0, + "reward": 1.640625, + "reward_std": 0.7413071691989899, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.3448760285973549, + "step": 1374, + "token_counts/after_target": 277.75, + "token_counts/after_think": 15.25, + "token_counts/before_target": 1211.75, + "token_counts/before_think": 841.75 + }, + { + "avg_penalty/after_target": 2.1034542322158813, + "avg_penalty/after_think": 2.8666528463363647, + "avg_penalty/before_target": 0.40553295984864235, + "avg_penalty/before_think": 0.38226867467164993, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.75, + "completions/max_terminated_length": 529.75, + "completions/mean_length": 167.109375, + "completions/mean_terminated_length": 167.109375, + "completions/min_length": 54.75, + "completions/min_terminated_length": 54.75, + "epoch": 0.6875, + "grad_norm": 2.5869743824005127, + "kl": 13.34375, + "learning_rate": 5.398002152161484e-06, + "loss": 1.2384, + "num_tokens": 41107440.0, + "reward": 1.73828125, + "reward_std": 0.6140089184045792, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.3529609143733978, + "rewards/tag_count_reward/mean": 0.89453125, + "rewards/tag_count_reward/std": 0.27458784729242325, + "step": 1375, + "token_counts/after_target": 392.75, + "token_counts/after_think": 30.5, + "token_counts/before_target": 1449.0, + "token_counts/before_think": 801.5 + }, + { + "avg_penalty/after_target": 
2.5683586597442627, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.5202945731580257, + "avg_penalty/before_think": 0.4091621860861778, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.75, + "completions/max_terminated_length": 620.75, + "completions/mean_length": 182.3125, + "completions/mean_terminated_length": 182.3125, + "completions/min_length": 34.25, + "completions/min_terminated_length": 34.25, + "epoch": 0.688, + "grad_norm": 5.087347030639648, + "kl": 30.1875, + "learning_rate": 5.382513867649663e-06, + "loss": 2.481, + "num_tokens": 41128852.0, + "reward": 1.4140625, + "reward_std": 0.8876871168613434, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.46566852182149887, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.43332313001155853, + "step": 1376, + "token_counts/after_target": 838.25, + "token_counts/after_think": 32.25, + "token_counts/before_target": 1592.25, + "token_counts/before_think": 454.25 + }, + { + "avg_penalty/after_target": 2.28996604681015, + "avg_penalty/after_think": 2.3427149653434753, + "avg_penalty/before_target": 0.37656833231449127, + "avg_penalty/before_think": 0.37435823678970337, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.75, + "completions/max_terminated_length": 481.75, + "completions/mean_length": 187.125, + "completions/mean_terminated_length": 187.125, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.6885, + "grad_norm": 5.885740280151367, + "kl": 10.8359375, + "learning_rate": 
5.367039648801386e-06, + "loss": 1.0998, + "num_tokens": 41151356.0, + "reward": 1.8203125, + "reward_std": 0.5262105464935303, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.3186737820506096, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.20895599201321602, + "step": 1377, + "token_counts/after_target": 342.75, + "token_counts/after_think": 39.75, + "token_counts/before_target": 1663.25, + "token_counts/before_think": 948.25 + }, + { + "avg_penalty/after_target": 2.6197871565818787, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.524657629430294, + "avg_penalty/before_think": 0.40260542929172516, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 734.5, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 179.75, + "completions/mean_terminated_length": 152.21667098999023, + "completions/min_length": 37.75, + "completions/min_terminated_length": 37.75, + "epoch": 0.689, + "grad_norm": 4.009365558624268, + "kl": 24.53125, + "learning_rate": 5.351579542753808e-06, + "loss": 2.114, + "num_tokens": 41175004.0, + "reward": 1.59375, + "reward_std": 0.7684976160526276, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4057852029800415, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.3728746026754379, + "step": 1378, + "token_counts/after_target": 667.25, + "token_counts/after_think": 31.5, + "token_counts/before_target": 1657.0, + "token_counts/before_think": 520.25 + }, + { + "avg_penalty/after_target": 2.724228084087372, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 
0.392174556851387, + "avg_penalty/before_think": 0.35741669684648514, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 217.796875, + "completions/mean_terminated_length": 217.796875, + "completions/min_length": 29.5, + "completions/min_terminated_length": 29.5, + "epoch": 0.6895, + "grad_norm": 3.954836130142212, + "kl": 16.5390625, + "learning_rate": 5.336133596601089e-06, + "loss": 1.4007, + "num_tokens": 41198863.0, + "reward": 1.5, + "reward_std": 0.8245663344860077, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4515564441680908, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.3873647376894951, + "step": 1379, + "token_counts/after_target": 716.0, + "token_counts/after_think": 86.0, + "token_counts/before_target": 1921.25, + "token_counts/before_think": 761.5 + }, + { + "avg_penalty/after_target": 1.746276617050171, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4177737310528755, + "avg_penalty/before_think": 0.5669186562299728, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.25, + "completions/max_terminated_length": 584.25, + "completions/mean_length": 193.21875, + "completions/mean_terminated_length": 193.21875, + "completions/min_length": 31.75, + "completions/min_terminated_length": 31.75, + "epoch": 0.69, + "grad_norm": 13.495952606201172, + "kl": 24.65625, + "learning_rate": 5.3207018573942684e-06, + "loss": 1.6994, + "num_tokens": 41221453.0, + "reward": 1.42578125, + "reward_std": 
0.8509394824504852, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4704566150903702, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.4139401540160179, + "step": 1380, + "token_counts/after_target": 473.5, + "token_counts/after_think": 38.0, + "token_counts/before_target": 1726.5, + "token_counts/before_think": 853.5 + }, + { + "avg_penalty/after_target": 2.8445814847946167, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3456369824707508, + "avg_penalty/before_think": 0.49587564915418625, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.25, + "completions/max_terminated_length": 454.25, + "completions/mean_length": 177.1875, + "completions/mean_terminated_length": 177.1875, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.6905, + "grad_norm": 7.056661128997803, + "kl": 10.4765625, + "learning_rate": 5.305284372141095e-06, + "loss": 1.1569, + "num_tokens": 41241001.0, + "reward": 1.73828125, + "reward_std": 0.7479286640882492, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38336414843797684, + "rewards/tag_count_reward/mean": 0.86328125, + "rewards/tag_count_reward/std": 0.33170362561941147, + "step": 1381, + "token_counts/after_target": 516.75, + "token_counts/after_think": 84.5, + "token_counts/before_target": 1555.5, + "token_counts/before_think": 678.25 + }, + { + "avg_penalty/after_target": 2.3868453204631805, + "avg_penalty/after_think": 3.7128706574440002, + "avg_penalty/before_target": 0.5030902326107025, + "avg_penalty/before_think": 0.4147441014647484, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 773.5, + "completions/max_terminated_length": 599.75, + "completions/mean_length": 173.484375, + "completions/mean_terminated_length": 160.02916717529297, + "completions/min_length": 39.75, + "completions/min_terminated_length": 39.75, + "epoch": 0.691, + "grad_norm": 7.378490924835205, + "kl": 23.15625, + "learning_rate": 5.2898811878059e-06, + "loss": 2.2754, + "num_tokens": 41262824.0, + "reward": 1.671875, + "reward_std": 0.7323294579982758, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38336414843797684, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.3553176745772362, + "step": 1382, + "token_counts/after_target": 595.0, + "token_counts/after_think": 101.5, + "token_counts/before_target": 1210.5, + "token_counts/before_think": 868.75 + }, + { + "avg_penalty/after_target": 2.3230856359004974, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.31091829016804695, + "avg_penalty/before_think": 0.47075898945331573, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.75, + "completions/max_terminated_length": 331.75, + "completions/mean_length": 128.140625, + "completions/mean_terminated_length": 128.140625, + "completions/min_length": 46.25, + "completions/min_terminated_length": 46.25, + "epoch": 0.6915, + "grad_norm": 16.430194854736328, + "kl": 17.03125, + "learning_rate": 5.274492351309462e-06, + "loss": 1.3917, + "num_tokens": 41279233.0, + "reward": 1.7265625, + "reward_std": 0.6071603894233704, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.3529609143733978, + "rewards/tag_count_reward/mean": 0.8828125, + "rewards/tag_count_reward/std": 0.26517264544963837, + "step": 1383, + "token_counts/after_target": 198.0, + "token_counts/after_think": 48.5, + "token_counts/before_target": 1281.5, + "token_counts/before_think": 522.25 + }, + { + "avg_penalty/after_target": 2.450309991836548, + "avg_penalty/after_think": 2.919463098049164, + "avg_penalty/before_target": 0.2659439519047737, + "avg_penalty/before_think": 0.37022730708122253, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 132.0, + "completions/mean_terminated_length": 132.0, + "completions/min_length": 44.75, + "completions/min_terminated_length": 44.75, + "epoch": 0.692, + "grad_norm": 17.04107093811035, + "kl": 29.375, + "learning_rate": 5.259117909528839e-06, + "loss": 2.0172, + "num_tokens": 41296305.0, + "reward": 1.52734375, + "reward_std": 0.8248002082109451, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4383598491549492, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.39398156851530075, + "step": 1384, + "token_counts/after_target": 214.75, + "token_counts/after_think": 18.75, + "token_counts/before_target": 1288.25, + "token_counts/before_think": 590.25 + }, + { + "avg_penalty/after_target": 2.987117111682892, + "avg_penalty/after_think": 3.818826198577881, + "avg_penalty/before_target": 0.2801591791212559, + "avg_penalty/before_think": 0.5801038444042206, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 
0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.25, + "completions/max_terminated_length": 485.25, + "completions/mean_length": 163.4375, + "completions/mean_terminated_length": 163.4375, + "completions/min_length": 49.25, + "completions/min_terminated_length": 49.25, + "epoch": 0.6925, + "grad_norm": 4.230436325073242, + "kl": 12.453125, + "learning_rate": 5.243757909297247e-06, + "loss": 1.141, + "num_tokens": 41317981.0, + "reward": 1.76953125, + "reward_std": 0.5318022668361664, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.2829566150903702, + "rewards/tag_count_reward/mean": 0.89453125, + "rewards/tag_count_reward/std": 0.24843144416809082, + "step": 1385, + "token_counts/after_target": 391.25, + "token_counts/after_think": 52.75, + "token_counts/before_target": 1394.75, + "token_counts/before_think": 776.25 + }, + { + "avg_penalty/after_target": 3.2493483424186707, + "avg_penalty/after_think": 3.78950434923172, + "avg_penalty/before_target": 0.3679392747581005, + "avg_penalty/before_think": 0.48958639055490494, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.5, + "completions/max_terminated_length": 473.5, + "completions/mean_length": 167.859375, + "completions/mean_terminated_length": 167.859375, + "completions/min_length": 39.5, + "completions/min_terminated_length": 39.5, + "epoch": 0.693, + "grad_norm": 7.418591499328613, + "kl": 25.625, + "learning_rate": 5.228412397403916e-06, + "loss": 2.0458, + "num_tokens": 41337780.0, + "reward": 1.5390625, + "reward_std": 0.8622758090496063, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 
0.0625, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4383598491549492, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.4127984866499901, + "step": 1386, + "token_counts/after_target": 584.25, + "token_counts/after_think": 61.0, + "token_counts/before_target": 1264.5, + "token_counts/before_think": 776.0 + }, + { + "avg_penalty/after_target": 2.908207356929779, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.34296920523047447, + "avg_penalty/before_think": 0.42904962599277496, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.75, + "completions/max_terminated_length": 562.75, + "completions/mean_length": 205.125, + "completions/mean_terminated_length": 205.125, + "completions/min_length": 62.75, + "completions/min_terminated_length": 62.75, + "epoch": 0.6935, + "grad_norm": 10.227340698242188, + "kl": 23.546875, + "learning_rate": 5.213081420593933e-06, + "loss": 1.7472, + "num_tokens": 41358012.0, + "reward": 1.515625, + "reward_std": 0.8808974772691727, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.42707233130931854, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.4159645363688469, + "step": 1387, + "token_counts/after_target": 531.0, + "token_counts/after_think": 45.5, + "token_counts/before_target": 1726.75, + "token_counts/before_think": 978.75 + }, + { + "avg_penalty/after_target": 1.9686311781406403, + "avg_penalty/after_think": 3.528765916824341, + "avg_penalty/before_target": 0.3569170832633972, + "avg_penalty/before_think": 0.6722647696733475, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 
0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.5, + "completions/max_terminated_length": 508.5, + "completions/mean_length": 165.546875, + "completions/mean_terminated_length": 165.546875, + "completions/min_length": 34.5, + "completions/min_terminated_length": 34.5, + "epoch": 0.694, + "grad_norm": 17.10271453857422, + "kl": 34.5, + "learning_rate": 5.197765025568109e-06, + "loss": 2.4934, + "num_tokens": 41377967.0, + "reward": 1.48046875, + "reward_std": 0.8542878031730652, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4604102149605751, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.4067709669470787, + "step": 1388, + "token_counts/after_target": 401.25, + "token_counts/after_think": 49.5, + "token_counts/before_target": 1831.75, + "token_counts/before_think": 366.25 + }, + { + "avg_penalty/after_target": 2.7308191657066345, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4957921877503395, + "avg_penalty/before_think": 0.4501034840941429, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 704.5, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 201.046875, + "completions/mean_terminated_length": 186.60104370117188, + "completions/min_length": 44.5, + "completions/min_terminated_length": 44.5, + "epoch": 0.6945, + "grad_norm": 5.503991603851318, + "kl": 27.375, + "learning_rate": 5.1824632589828465e-06, + "loss": 2.4636, + "num_tokens": 41400626.0, + "reward": 1.63671875, + "reward_std": 0.7280787229537964, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 
0.37937305867671967, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.3582276925444603, + "step": 1389, + "token_counts/after_target": 1046.25, + "token_counts/after_think": 20.25, + "token_counts/before_target": 1474.5, + "token_counts/before_think": 675.75 + }, + { + "avg_penalty/after_target": 2.717615485191345, + "avg_penalty/after_think": 3.3229292035102844, + "avg_penalty/before_target": 0.30603325739502907, + "avg_penalty/before_think": 0.38652390241622925, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.25, + "completions/max_terminated_length": 468.25, + "completions/mean_length": 179.640625, + "completions/mean_terminated_length": 179.640625, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.695, + "grad_norm": 7.900355815887451, + "kl": 24.09375, + "learning_rate": 5.167176167449977e-06, + "loss": 1.818, + "num_tokens": 41420331.0, + "reward": 1.5859375, + "reward_std": 0.804242730140686, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.40303630381822586, + "step": 1390, + "token_counts/after_target": 299.0, + "token_counts/after_think": 52.75, + "token_counts/before_target": 1729.5, + "token_counts/before_think": 793.0 + }, + { + "avg_penalty/after_target": 2.5992818772792816, + "avg_penalty/after_think": 3.90117084980011, + "avg_penalty/before_target": 0.3697924241423607, + "avg_penalty/before_think": 0.5168233290314674, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + 
"completions/max_length": 530.5, + "completions/max_terminated_length": 530.5, + "completions/mean_length": 141.296875, + "completions/mean_terminated_length": 141.296875, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.6955, + "grad_norm": 6.905447959899902, + "kl": 15.10693359375, + "learning_rate": 5.151903797536631e-06, + "loss": 1.653, + "num_tokens": 41440190.0, + "reward": 1.7890625, + "reward_std": 0.518342912197113, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.2596946656703949, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.2635650783777237, + "step": 1391, + "token_counts/after_target": 369.5, + "token_counts/after_think": 101.25, + "token_counts/before_target": 1086.75, + "token_counts/before_think": 703.25 + }, + { + "avg_penalty/after_target": 2.219685733318329, + "avg_penalty/after_think": 3.4327108561992645, + "avg_penalty/before_target": 0.3448418490588665, + "avg_penalty/before_think": 0.38822925835847855, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.75, + "completions/max_terminated_length": 449.75, + "completions/mean_length": 178.171875, + "completions/mean_terminated_length": 178.171875, + "completions/min_length": 48.75, + "completions/min_terminated_length": 48.75, + "epoch": 0.696, + "grad_norm": 5.329963207244873, + "kl": 24.1875, + "learning_rate": 5.136646195765096e-06, + "loss": 1.9363, + "num_tokens": 41461673.0, + "reward": 1.625, + "reward_std": 0.7882160395383835, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 0.8125, + 
"rewards/tag_count_reward/std": 0.3905584514141083, + "step": 1392, + "token_counts/after_target": 430.75, + "token_counts/after_think": 88.0, + "token_counts/before_target": 1417.25, + "token_counts/before_think": 914.75 + }, + { + "avg_penalty/after_target": 2.428867071866989, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.6150490902364254, + "avg_penalty/before_think": 0.5352247506380081, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 691.25, + "completions/max_terminated_length": 534.75, + "completions/mean_length": 232.46875, + "completions/mean_terminated_length": 218.18958473205566, + "completions/min_length": 53.75, + "completions/min_terminated_length": 53.75, + "epoch": 0.6965, + "grad_norm": 7.99839448928833, + "kl": 20.53125, + "learning_rate": 5.121403408612672e-06, + "loss": 2.0418, + "num_tokens": 41486391.0, + "reward": 1.640625, + "reward_std": 0.7299756705760956, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.33854540809988976, + "step": 1393, + "token_counts/after_target": 1010.5, + "token_counts/after_think": 18.0, + "token_counts/before_target": 1604.0, + "token_counts/before_think": 1087.0 + }, + { + "avg_penalty/after_target": 2.9004727602005005, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.34932539984583855, + "avg_penalty/before_think": 0.47401322424411774, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 740.5, + "completions/max_terminated_length": 644.25, 
+ "completions/mean_length": 223.515625, + "completions/mean_terminated_length": 212.13021087646484, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.697, + "grad_norm": 5.396336078643799, + "kl": 24.375, + "learning_rate": 5.106175482511537e-06, + "loss": 2.1603, + "num_tokens": 41511448.0, + "reward": 1.4921875, + "reward_std": 0.8029935210943222, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.46513500809669495, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.3650682270526886, + "step": 1394, + "token_counts/after_target": 727.75, + "token_counts/after_think": 80.25, + "token_counts/before_target": 1804.5, + "token_counts/before_think": 963.75 + }, + { + "avg_penalty/after_target": 2.7997984290122986, + "avg_penalty/after_think": 3.8228753805160522, + "avg_penalty/before_target": 0.3126254715025425, + "avg_penalty/before_think": 0.44586097449064255, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.75, + "completions/max_terminated_length": 476.75, + "completions/mean_length": 176.171875, + "completions/mean_terminated_length": 176.171875, + "completions/min_length": 46.75, + "completions/min_terminated_length": 46.75, + "epoch": 0.6975, + "grad_norm": 3.456916332244873, + "kl": 20.046875, + "learning_rate": 5.090962463848592e-06, + "loss": 1.7631, + "num_tokens": 41531955.0, + "reward": 1.69140625, + "reward_std": 0.7453366369009018, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.14789126068353653, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 
0.34889380633831024, + "step": 1395, + "token_counts/after_target": 507.5, + "token_counts/after_think": 44.0, + "token_counts/before_target": 1417.75, + "token_counts/before_think": 849.5 + }, + { + "avg_penalty/after_target": 3.1368011832237244, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.43984486907720566, + "avg_penalty/before_think": 0.3778705261647701, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 179.46875, + "completions/mean_terminated_length": 179.46875, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.698, + "grad_norm": 3.32346248626709, + "kl": 23.9921875, + "learning_rate": 5.075764398965331e-06, + "loss": 2.0945, + "num_tokens": 41553409.0, + "reward": 1.515625, + "reward_std": 0.7922046184539795, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4414467439055443, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.3559853583574295, + "step": 1396, + "token_counts/after_target": 592.5, + "token_counts/after_think": 31.75, + "token_counts/before_target": 1645.25, + "token_counts/before_think": 602.0 + }, + { + "avg_penalty/after_target": 2.7338168621063232, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.43194035813212395, + "avg_penalty/before_think": 0.41424787044525146, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 640.0, + "completions/max_terminated_length": 541.5, + "completions/mean_length": 199.328125, + 
"completions/mean_terminated_length": 187.4093780517578, + "completions/min_length": 42.5, + "completions/min_terminated_length": 42.5, + "epoch": 0.6985, + "grad_norm": 6.140399932861328, + "kl": 21.46875, + "learning_rate": 5.060581334157693e-06, + "loss": 1.8515, + "num_tokens": 41575254.0, + "reward": 1.57421875, + "reward_std": 0.7740562707185745, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42733466625213623, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.3564517982304096, + "step": 1397, + "token_counts/after_target": 786.0, + "token_counts/after_think": 17.5, + "token_counts/before_target": 1537.75, + "token_counts/before_think": 848.0 + }, + { + "avg_penalty/after_target": 2.256540298461914, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.5264798179268837, + "avg_penalty/before_think": 0.4650377631187439, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 698.0, + "completions/max_terminated_length": 557.75, + "completions/mean_length": 211.453125, + "completions/mean_terminated_length": 197.6343765258789, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.699, + "grad_norm": 8.963830947875977, + "kl": 31.5625, + "learning_rate": 5.045413315675925e-06, + "loss": 2.4191, + "num_tokens": 41597747.0, + "reward": 1.37109375, + "reward_std": 0.8465841561555862, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.4423432722687721, + "rewards/tag_count_reward/mean": 0.69921875, + "rewards/tag_count_reward/std": 0.43321602791547775, + "step": 1398, + "token_counts/after_target": 718.0, + 
"token_counts/after_think": 23.75, + "token_counts/before_target": 1917.0, + "token_counts/before_think": 724.5 + }, + { + "avg_penalty/after_target": 2.062474638223648, + "avg_penalty/after_think": 2.527696967124939, + "avg_penalty/before_target": 0.41515500470995903, + "avg_penalty/before_think": 0.4143695682287216, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 623.5, + "completions/max_terminated_length": 623.5, + "completions/mean_length": 196.375, + "completions/mean_terminated_length": 196.375, + "completions/min_length": 51.5, + "completions/min_terminated_length": 51.5, + "epoch": 0.6995, + "grad_norm": 5.719552516937256, + "kl": 20.5859375, + "learning_rate": 5.030260389724447e-06, + "loss": 1.6104, + "num_tokens": 41623099.0, + "reward": 1.6015625, + "reward_std": 0.8122911900281906, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.14789126068353653, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.41898179799318314, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3581402376294136, + "step": 1399, + "token_counts/after_target": 505.75, + "token_counts/after_think": 27.0, + "token_counts/before_target": 1666.0, + "token_counts/before_think": 943.25 + }, + { + "avg_penalty/after_target": 2.4108914136886597, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4469895288348198, + "avg_penalty/before_think": 0.5594375282526016, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 756.25, + "completions/max_terminated_length": 667.25, + "completions/mean_length": 293.515625, + "completions/mean_terminated_length": 
281.7354202270508, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.7, + "grad_norm": 6.398295879364014, + "kl": 22.171875, + "learning_rate": 5.015122602461698e-06, + "loss": 1.7573, + "num_tokens": 41650620.0, + "reward": 1.44921875, + "reward_std": 0.8191997408866882, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.45129410922527313, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.38798724114894867, + "step": 1400, + "token_counts/after_target": 976.5, + "token_counts/after_think": 76.0, + "token_counts/before_target": 2844.0, + "token_counts/before_think": 799.75 + }, + { + "avg_penalty/after_target": 2.362504243850708, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3710588291287422, + "avg_penalty/before_think": 0.4636441022157669, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 735.25, + "completions/max_terminated_length": 646.25, + "completions/mean_length": 234.6875, + "completions/mean_terminated_length": 222.3218765258789, + "completions/min_length": 49.75, + "completions/min_terminated_length": 49.75, + "epoch": 0.7005, + "grad_norm": 4.365045070648193, + "kl": 27.03125, + "learning_rate": 5.000000000000003e-06, + "loss": 2.2194, + "num_tokens": 41675304.0, + "reward": 1.52734375, + "reward_std": 0.8517717719078064, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.44721361994743347, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.41199180483818054, + "step": 1401, + "token_counts/after_target": 855.25, + "token_counts/after_think": 35.0, + 
"token_counts/before_target": 2171.75, + "token_counts/before_think": 693.0 + }, + { + "avg_penalty/after_target": 2.6336810886859894, + "avg_penalty/after_think": 1.7592735290527344, + "avg_penalty/before_target": 0.41789373755455017, + "avg_penalty/before_think": 0.4590187296271324, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 667.0, + "completions/max_terminated_length": 667.0, + "completions/mean_length": 254.90625, + "completions/mean_terminated_length": 254.90625, + "completions/min_length": 43.25, + "completions/min_terminated_length": 43.25, + "epoch": 0.701, + "grad_norm": 4.937894821166992, + "kl": 26.625, + "learning_rate": 4.984892628405426e-06, + "loss": 2.1702, + "num_tokens": 41704722.0, + "reward": 1.359375, + "reward_std": 0.8834948539733887, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4665650501847267, + "rewards/tag_count_reward/mean": 0.703125, + "rewards/tag_count_reward/std": 0.4305638447403908, + "step": 1402, + "token_counts/after_target": 880.25, + "token_counts/after_think": 122.75, + "token_counts/before_target": 1720.25, + "token_counts/before_think": 1355.25 + }, + { + "avg_penalty/after_target": 2.5484835505485535, + "avg_penalty/after_think": 1.8458421230316162, + "avg_penalty/before_target": 0.43725433200597763, + "avg_penalty/before_think": 0.48503535240888596, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 537.5, + "completions/max_terminated_length": 380.75, + "completions/mean_length": 168.765625, + "completions/mean_terminated_length": 155.54792022705078, + "completions/min_length": 
31.75, + "completions/min_terminated_length": 31.75, + "epoch": 0.7015, + "grad_norm": 5.721954345703125, + "kl": 15.859375, + "learning_rate": 4.96980053369765e-06, + "loss": 1.447, + "num_tokens": 41725811.0, + "reward": 1.4921875, + "reward_std": 0.84184131026268, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44187305867671967, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.40662769973278046, + "step": 1403, + "token_counts/after_target": 581.75, + "token_counts/after_think": 13.0, + "token_counts/before_target": 1418.75, + "token_counts/before_think": 686.75 + }, + { + "avg_penalty/after_target": 2.543591320514679, + "avg_penalty/after_think": 3.525978922843933, + "avg_penalty/before_target": 0.37325458973646164, + "avg_penalty/before_think": 0.47461915761232376, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.5, + "completions/max_terminated_length": 578.5, + "completions/mean_length": 247.453125, + "completions/mean_terminated_length": 247.453125, + "completions/min_length": 36.5, + "completions/min_terminated_length": 36.5, + "epoch": 0.702, + "grad_norm": 3.84466552734375, + "kl": 16.578125, + "learning_rate": 4.954723761849809e-06, + "loss": 1.5125, + "num_tokens": 41755472.0, + "reward": 1.515625, + "reward_std": 0.7567418366670609, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.45028156042099, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.3346497341990471, + "step": 1404, + "token_counts/after_target": 690.75, + "token_counts/after_think": 27.75, + "token_counts/before_target": 2101.25, + 
"token_counts/before_think": 1139.5 + }, + { + "avg_penalty/after_target": 2.7132941484451294, + "avg_penalty/after_think": 2.849231719970703, + "avg_penalty/before_target": 0.40091440826654434, + "avg_penalty/before_think": 0.4075036309659481, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 677.5, + "completions/max_terminated_length": 664.5, + "completions/mean_length": 290.625, + "completions/mean_terminated_length": 280.92708587646484, + "completions/min_length": 54.5, + "completions/min_terminated_length": 54.5, + "epoch": 0.7025, + "grad_norm": 3.58198618888855, + "kl": 22.8125, + "learning_rate": 4.939662358788364e-06, + "loss": 1.8666, + "num_tokens": 41784440.0, + "reward": 1.38671875, + "reward_std": 0.8903416395187378, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.49244368076324463, + "rewards/tag_count_reward/mean": 0.73046875, + "rewards/tag_count_reward/std": 0.4157659560441971, + "step": 1405, + "token_counts/after_target": 917.5, + "token_counts/after_think": 120.0, + "token_counts/before_target": 2747.25, + "token_counts/before_think": 865.25 + }, + { + "avg_penalty/after_target": 2.2894636392593384, + "avg_penalty/after_think": 3.405830502510071, + "avg_penalty/before_target": 0.5487493351101875, + "avg_penalty/before_think": 0.6178779900074005, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 697.5, + "completions/max_terminated_length": 667.75, + "completions/mean_length": 252.265625, + "completions/mean_terminated_length": 241.88229370117188, + "completions/min_length": 48.25, + 
"completions/min_terminated_length": 48.25, + "epoch": 0.703, + "grad_norm": 12.642423629760742, + "kl": 18.28125, + "learning_rate": 4.924616370392962e-06, + "loss": 1.9516, + "num_tokens": 41810265.0, + "reward": 1.59375, + "reward_std": 0.739254891872406, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4229728877544403, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.3437676727771759, + "step": 1406, + "token_counts/after_target": 1009.0, + "token_counts/after_think": 120.75, + "token_counts/before_target": 2105.25, + "token_counts/before_think": 801.25 + }, + { + "avg_penalty/after_target": 2.1581310033798218, + "avg_penalty/after_think": 3.913166344165802, + "avg_penalty/before_target": 0.5459484979510307, + "avg_penalty/before_think": 0.48080870509147644, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 811.5, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 265.875, + "completions/mean_terminated_length": 253.81042098999023, + "completions/min_length": 48.25, + "completions/min_terminated_length": 48.25, + "epoch": 0.7035, + "grad_norm": 8.15610122680664, + "kl": 15.296875, + "learning_rate": 4.909585842496287e-06, + "loss": 1.5854, + "num_tokens": 41837425.0, + "reward": 1.48046875, + "reward_std": 0.7979313731193542, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4581565484404564, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.34506169706583023, + "step": 1407, + "token_counts/after_target": 1051.5, + "token_counts/after_think": 62.0, + "token_counts/before_target": 2084.5, + 
"token_counts/before_think": 1056.0 + }, + { + "avg_penalty/after_target": 2.461997389793396, + "avg_penalty/after_think": 3.513781189918518, + "avg_penalty/before_target": 0.4187328442931175, + "avg_penalty/before_think": 0.49957606941461563, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 692.5, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 232.859375, + "completions/mean_terminated_length": 219.58854293823242, + "completions/min_length": 52.5, + "completions/min_terminated_length": 52.5, + "epoch": 0.704, + "grad_norm": 8.690862655639648, + "kl": 20.71875, + "learning_rate": 4.894570820883943e-06, + "loss": 2.0353, + "num_tokens": 41861368.0, + "reward": 1.52734375, + "reward_std": 0.7955252677202225, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4598134011030197, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.3599461577832699, + "step": 1408, + "token_counts/after_target": 766.5, + "token_counts/after_think": 130.0, + "token_counts/before_target": 1719.5, + "token_counts/before_think": 1109.75 + }, + { + "avg_penalty/after_target": 3.0297043919563293, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.2957178205251694, + "avg_penalty/before_think": 0.5997487157583237, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 615.25, + "completions/max_terminated_length": 615.25, + "completions/mean_length": 280.15625, + "completions/mean_terminated_length": 280.15625, + "completions/min_length": 58.5, + "completions/min_terminated_length": 58.5, + 
"epoch": 0.7045, + "grad_norm": 5.789785385131836, + "kl": 18.234375, + "learning_rate": 4.879571351294287e-06, + "loss": 1.4416, + "num_tokens": 41889890.0, + "reward": 1.5546875, + "reward_std": 0.7510001063346863, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.41419370472431183, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.36160848289728165, + "step": 1409, + "token_counts/after_target": 745.75, + "token_counts/after_think": 48.0, + "token_counts/before_target": 2555.75, + "token_counts/before_think": 1133.0 + }, + { + "avg_penalty/after_target": 2.439093232154846, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.38951098173856735, + "avg_penalty/before_think": 0.5016332864761353, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 688.0, + "completions/max_terminated_length": 596.25, + "completions/mean_length": 289.34375, + "completions/mean_terminated_length": 277.7229232788086, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.705, + "grad_norm": 4.819460868835449, + "kl": 14.921875, + "learning_rate": 4.864587479418302e-06, + "loss": 1.4554, + "num_tokens": 41917720.0, + "reward": 1.515625, + "reward_std": 0.7500269114971161, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.42206869274377823, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.3371509313583374, + "step": 1410, + "token_counts/after_target": 872.5, + "token_counts/after_think": 77.0, + "token_counts/before_target": 2569.25, + "token_counts/before_think": 1110.75 + }, + { + "avg_penalty/after_target": 
2.0224817991256714, + "avg_penalty/after_think": 3.879969596862793, + "avg_penalty/before_target": 0.5367254912853241, + "avg_penalty/before_think": 0.7261801809072495, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 859.0, + "completions/max_terminated_length": 762.0, + "completions/mean_length": 290.46875, + "completions/mean_terminated_length": 277.0729179382324, + "completions/min_length": 48.25, + "completions/min_terminated_length": 48.25, + "epoch": 0.7055, + "grad_norm": 2.8116676807403564, + "kl": 23.78125, + "learning_rate": 4.849619250899458e-06, + "loss": 2.0758, + "num_tokens": 41950470.0, + "reward": 1.55078125, + "reward_std": 0.7654954344034195, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4380975142121315, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.3717714175581932, + "step": 1411, + "token_counts/after_target": 1128.0, + "token_counts/after_think": 22.0, + "token_counts/before_target": 1931.75, + "token_counts/before_think": 1565.75 + }, + { + "avg_penalty/after_target": 2.3942347168922424, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.39037978649139404, + "avg_penalty/before_think": 0.449823334813118, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.25, + "completions/max_terminated_length": 591.25, + "completions/mean_length": 251.375, + "completions/mean_terminated_length": 251.375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.706, + "grad_norm": 5.789560794830322, + "kl": 
17.609375, + "learning_rate": 4.8346667113335824e-06, + "loss": 1.6521, + "num_tokens": 41975998.0, + "reward": 1.56640625, + "reward_std": 0.7531820833683014, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44187305867671967, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.341947041451931, + "step": 1412, + "token_counts/after_target": 735.0, + "token_counts/after_think": 139.75, + "token_counts/before_target": 2027.0, + "token_counts/before_think": 1120.25 + }, + { + "avg_penalty/after_target": 2.454556792974472, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.5553543791174889, + "avg_penalty/before_think": 0.5899386182427406, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 949.5, + "completions/max_terminated_length": 811.25, + "completions/mean_length": 371.0, + "completions/mean_terminated_length": 339.10194396972656, + "completions/min_length": 69.5, + "completions/min_terminated_length": 69.5, + "epoch": 0.7065, + "grad_norm": 2.6575241088867188, + "kl": 23.546875, + "learning_rate": 4.8197299062687e-06, + "loss": 2.0109, + "num_tokens": 42009022.0, + "reward": 1.33984375, + "reward_std": 0.8436812907457352, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.46566852182149887, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.4075435549020767, + "step": 1413, + "token_counts/after_target": 1550.5, + "token_counts/after_think": 244.0, + "token_counts/before_target": 3087.5, + "token_counts/before_think": 1054.0 + }, + { + "avg_penalty/after_target": 1.6965875029563904, + "avg_penalty/after_think": 
3.9960983395576477, + "avg_penalty/before_target": 0.3854678049683571, + "avg_penalty/before_think": 0.5789965391159058, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 848.25, + "completions/max_terminated_length": 775.75, + "completions/mean_length": 298.34375, + "completions/mean_terminated_length": 286.42083740234375, + "completions/min_length": 61.5, + "completions/min_terminated_length": 61.5, + "epoch": 0.707, + "grad_norm": 6.69114351272583, + "kl": 19.4375, + "learning_rate": 4.80480888120491e-06, + "loss": 1.5355, + "num_tokens": 42036740.0, + "reward": 1.55859375, + "reward_std": 0.8423373103141785, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42867646366357803, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.400500051677227, + "step": 1414, + "token_counts/after_target": 751.75, + "token_counts/after_think": 71.25, + "token_counts/before_target": 2052.75, + "token_counts/before_think": 1897.75 + }, + { + "avg_penalty/after_target": 2.1636393666267395, + "avg_penalty/after_think": 3.985909879207611, + "avg_penalty/before_target": 0.38075222820043564, + "avg_penalty/before_think": 0.5776500552892685, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 734.5, + "completions/max_terminated_length": 734.5, + "completions/mean_length": 288.78125, + "completions/mean_terminated_length": 288.78125, + "completions/min_length": 30.75, + "completions/min_terminated_length": 30.75, + "epoch": 0.7075, + "grad_norm": 8.912707328796387, + "kl": 24.0625, + "learning_rate": 
4.78990368159424e-06, + "loss": 1.8522, + "num_tokens": 42064374.0, + "reward": 1.4296875, + "reward_std": 0.8239044100046158, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.47770625352859497, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.39089177548885345, + "step": 1415, + "token_counts/after_target": 707.0, + "token_counts/after_think": 69.5, + "token_counts/before_target": 2625.0, + "token_counts/before_think": 1219.0 + }, + { + "avg_penalty/after_target": 2.6047224700450897, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3851246200501919, + "avg_penalty/before_think": 0.8650360405445099, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 621.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 247.234375, + "completions/mean_terminated_length": 247.234375, + "completions/min_length": 51.25, + "completions/min_terminated_length": 51.25, + "epoch": 0.708, + "grad_norm": 2.8111469745635986, + "kl": 23.59375, + "learning_rate": 4.775014352840512e-06, + "loss": 2.0128, + "num_tokens": 42090293.0, + "reward": 1.453125, + "reward_std": 0.8134089410305023, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.4692344516515732, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.37871627509593964, + "step": 1416, + "token_counts/after_target": 1027.5, + "token_counts/after_think": 76.75, + "token_counts/before_target": 2169.75, + "token_counts/before_think": 681.75 + }, + { + "avg_penalty/after_target": 2.8097967505455017, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 
0.40445462614297867, + "avg_penalty/before_think": 0.5200299844145775, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 645.25, + "completions/max_terminated_length": 541.5, + "completions/mean_length": 225.109375, + "completions/mean_terminated_length": 213.03646087646484, + "completions/min_length": 39.75, + "completions/min_terminated_length": 39.75, + "epoch": 0.7085, + "grad_norm": 6.012775897979736, + "kl": 23.28125, + "learning_rate": 4.76014094029921e-06, + "loss": 2.1157, + "num_tokens": 42118924.0, + "reward": 1.46875, + "reward_std": 0.8108867108821869, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4414467439055443, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.3877381533384323, + "step": 1417, + "token_counts/after_target": 802.25, + "token_counts/after_think": 100.75, + "token_counts/before_target": 1739.75, + "token_counts/before_think": 959.0 + }, + { + "avg_penalty/after_target": 2.019952893257141, + "avg_penalty/after_think": 3.7942484617233276, + "avg_penalty/before_target": 0.4538426622748375, + "avg_penalty/before_think": 0.4687949866056442, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 731.5, + "completions/max_terminated_length": 508.5, + "completions/mean_length": 198.828125, + "completions/mean_terminated_length": 173.07917022705078, + "completions/min_length": 37.25, + "completions/min_terminated_length": 37.25, + "epoch": 0.709, + "grad_norm": 4.108648777008057, + "kl": 21.578125, + "learning_rate": 4.745283489277324e-06, + "loss": 1.7955, + "num_tokens": 
42142849.0, + "reward": 1.55078125, + "reward_std": 0.7918997555971146, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.43655145168304443, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.3675876036286354, + "step": 1418, + "token_counts/after_target": 468.75, + "token_counts/after_think": 61.75, + "token_counts/before_target": 1629.5, + "token_counts/before_think": 1021.25 + }, + { + "avg_penalty/after_target": 2.4382691085338593, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3731708601117134, + "avg_penalty/before_think": 0.5098995193839073, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 663.75, + "completions/max_terminated_length": 600.5, + "completions/mean_length": 252.875, + "completions/mean_terminated_length": 240.1739616394043, + "completions/min_length": 48.75, + "completions/min_terminated_length": 48.75, + "epoch": 0.7095, + "grad_norm": 4.747226715087891, + "kl": 22.921875, + "learning_rate": 4.7304420450332244e-06, + "loss": 1.9274, + "num_tokens": 42170329.0, + "reward": 1.4921875, + "reward_std": 0.8319491446018219, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4598134011030197, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.39382562786340714, + "step": 1419, + "token_counts/after_target": 733.75, + "token_counts/after_think": 117.25, + "token_counts/before_target": 1798.5, + "token_counts/before_think": 1396.5 + }, + { + "avg_penalty/after_target": 2.5340352058410645, + "avg_penalty/after_think": 3.920959711074829, + "avg_penalty/before_target": 0.5258857905864716, + 
"avg_penalty/before_think": 0.6174918636679649, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 869.0, + "completions/max_terminated_length": 716.0, + "completions/mean_length": 346.046875, + "completions/mean_terminated_length": 325.0791778564453, + "completions/min_length": 47.25, + "completions/min_terminated_length": 47.25, + "epoch": 0.71, + "grad_norm": 6.927793979644775, + "kl": 21.4765625, + "learning_rate": 4.71561665277653e-06, + "loss": 2.0519, + "num_tokens": 42203036.0, + "reward": 1.375, + "reward_std": 0.7346866279840469, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4215351790189743, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.3327867239713669, + "step": 1420, + "token_counts/after_target": 1389.25, + "token_counts/after_think": 396.25, + "token_counts/before_target": 2584.75, + "token_counts/before_think": 1166.5 + }, + { + "avg_penalty/after_target": 2.102713942527771, + "avg_penalty/after_think": 1.6329143643379211, + "avg_penalty/before_target": 0.2852865047752857, + "avg_penalty/before_think": 0.541108712553978, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.75, + "completions/max_terminated_length": 528.75, + "completions/mean_length": 204.890625, + "completions/mean_terminated_length": 204.890625, + "completions/min_length": 58.25, + "completions/min_terminated_length": 58.25, + "epoch": 0.7105, + "grad_norm": 5.314045429229736, + "kl": 20.5, + "learning_rate": 4.700807357667953e-06, + "loss": 1.5916, + "num_tokens": 42225189.0, + "reward": 1.41015625, + 
"reward_std": 0.877328172326088, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.47354350984096527, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.42797113209962845, + "step": 1421, + "token_counts/after_target": 474.0, + "token_counts/after_think": 61.75, + "token_counts/before_target": 1915.0, + "token_counts/before_think": 827.5 + }, + { + "avg_penalty/after_target": 2.5745869576931, + "avg_penalty/after_think": 3.639721632003784, + "avg_penalty/before_target": 0.37376539036631584, + "avg_penalty/before_think": 0.5509519726037979, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.5, + "completions/max_terminated_length": 614.5, + "completions/mean_length": 293.484375, + "completions/mean_terminated_length": 293.484375, + "completions/min_length": 56.75, + "completions/min_terminated_length": 56.75, + "epoch": 0.711, + "grad_norm": 4.5175275802612305, + "kl": 17.328125, + "learning_rate": 4.686014204819171e-06, + "loss": 1.587, + "num_tokens": 42253588.0, + "reward": 1.46484375, + "reward_std": 0.8045262843370438, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4581565484404564, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.3734908662736416, + "step": 1422, + "token_counts/after_target": 976.0, + "token_counts/after_think": 157.0, + "token_counts/before_target": 2222.5, + "token_counts/before_think": 1340.25 + }, + { + "avg_penalty/after_target": 2.4080966114997864, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.5001992732286453, + "avg_penalty/before_think": 0.5665568932890892, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 862.5, + "completions/max_terminated_length": 697.5, + "completions/mean_length": 290.328125, + "completions/mean_terminated_length": 256.36355209350586, + "completions/min_length": 34.75, + "completions/min_terminated_length": 34.75, + "epoch": 0.7115, + "grad_norm": 5.211231708526611, + "kl": 22.71875, + "learning_rate": 4.671237239292699e-06, + "loss": 2.0641, + "num_tokens": 42281337.0, + "reward": 1.5390625, + "reward_std": 0.7783290296792984, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.41898179799318314, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.37503284215927124, + "step": 1423, + "token_counts/after_target": 1185.5, + "token_counts/after_think": 131.0, + "token_counts/before_target": 2526.75, + "token_counts/before_think": 802.0 + }, + { + "avg_penalty/after_target": 3.498226225376129, + "avg_penalty/after_think": 2.453747034072876, + "avg_penalty/before_target": 0.33813541010022163, + "avg_penalty/before_think": 0.5007347539067268, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 196.671875, + "completions/mean_terminated_length": 196.671875, + "completions/min_length": 45.25, + "completions/min_terminated_length": 45.25, + "epoch": 0.712, + "grad_norm": 9.635819435119629, + "kl": 15.375, + "learning_rate": 4.656476506101737e-06, + "loss": 1.5671, + "num_tokens": 42304724.0, + "reward": 1.57421875, + "reward_std": 0.8141016811132431, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.401825986802578, + "step": 1424, + "token_counts/after_target": 573.25, + "token_counts/after_think": 64.25, + "token_counts/before_target": 1571.0, + "token_counts/before_think": 938.25 + }, + { + "avg_penalty/after_target": 1.7003082931041718, + "avg_penalty/after_think": 3.983573079109192, + "avg_penalty/before_target": 0.5271663963794708, + "avg_penalty/before_think": 0.4873204678297043, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 216.3125, + "completions/mean_terminated_length": 216.3125, + "completions/min_length": 46.25, + "completions/min_terminated_length": 46.25, + "epoch": 0.7125, + "grad_norm": 4.60776948928833, + "kl": 23.0, + "learning_rate": 4.641732050210032e-06, + "loss": 2.1187, + "num_tokens": 42327096.0, + "reward": 1.57421875, + "reward_std": 0.7607488930225372, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4079566150903702, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.3646041080355644, + "step": 1425, + "token_counts/after_target": 652.75, + "token_counts/after_think": 92.75, + "token_counts/before_target": 1943.5, + "token_counts/before_think": 772.0 + }, + { + "avg_penalty/after_target": 2.5929354429244995, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.43503329157829285, + "avg_penalty/before_think": 0.6834732741117477, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 307.421875, + "completions/mean_terminated_length": 307.421875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.713, + "grad_norm": 2.4979987144470215, + "kl": 21.78125, + "learning_rate": 4.627003916531761e-06, + "loss": 1.8485, + "num_tokens": 42356211.0, + "reward": 1.3828125, + "reward_std": 0.8836726695299149, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.47663040459156036, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.4321054592728615, + "step": 1426, + "token_counts/after_target": 1176.75, + "token_counts/after_think": 83.75, + "token_counts/before_target": 2568.5, + "token_counts/before_think": 1089.75 + }, + { + "avg_penalty/after_target": 2.25736927986145, + "avg_penalty/after_think": 2.6956287622451782, + "avg_penalty/before_target": 0.4115145206451416, + "avg_penalty/before_think": 0.5359682105481625, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.5, + "completions/max_terminated_length": 529.5, + "completions/mean_length": 215.40625, + "completions/mean_terminated_length": 215.40625, + "completions/min_length": 55.75, + "completions/min_terminated_length": 55.75, + "epoch": 0.7135, + "grad_norm": 3.139948606491089, + "kl": 23.71875, + "learning_rate": 4.612292149931369e-06, + "loss": 2.0807, + "num_tokens": 42379485.0, + "reward": 1.49609375, + "reward_std": 0.861174687743187, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4519384130835533, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.4131026193499565, + "step": 1427, + "token_counts/after_target": 733.5, + "token_counts/after_think": 132.75, + "token_counts/before_target": 1781.75, + "token_counts/before_think": 798.5 + }, + { + "avg_penalty/after_target": 2.029896467924118, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3699520602822304, + "avg_penalty/before_think": 0.5186657533049583, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 259.15625, + "completions/mean_terminated_length": 259.15625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.714, + "grad_norm": 6.7510085105896, + "kl": 23.8125, + "learning_rate": 4.59759679522345e-06, + "loss": 1.8651, + "num_tokens": 42403911.0, + "reward": 1.4609375, + "reward_std": 0.8214051574468613, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.45283494144678116, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.38187648728489876, + "step": 1428, + "token_counts/after_target": 764.5, + "token_counts/after_think": 98.0, + "token_counts/before_target": 2179.0, + "token_counts/before_think": 1105.0 + }, + { + "avg_penalty/after_target": 2.2986570298671722, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.2674196697771549, + "avg_penalty/before_think": 0.470166340470314, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 589.25, + "completions/max_terminated_length": 589.25, + "completions/mean_length": 230.328125, + "completions/mean_terminated_length": 230.328125, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.7145, + "grad_norm": 4.974922180175781, + "kl": 12.57421875, + "learning_rate": 4.582917897172603e-06, + "loss": 1.1425, + "num_tokens": 42427612.0, + "reward": 1.6796875, + "reward_std": 0.5014205425977707, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.30717839300632477, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.20803537219762802, + "step": 1429, + "token_counts/after_target": 288.75, + "token_counts/after_think": 98.25, + "token_counts/before_target": 2395.0, + "token_counts/before_think": 903.25 + }, + { + "avg_penalty/after_target": 2.87814062833786, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3802384212613106, + "avg_penalty/before_think": 0.4359150826931, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 181.328125, + "completions/mean_terminated_length": 181.328125, + "completions/min_length": 43.5, + "completions/min_terminated_length": 43.5, + "epoch": 0.715, + "grad_norm": 5.458690166473389, + "kl": 15.671875, + "learning_rate": 4.568255500493292e-06, + "loss": 1.5372, + "num_tokens": 42448673.0, + "reward": 1.61328125, + "reward_std": 0.7322626560926437, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.3842606768012047, + 
"rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.35467904806137085, + "step": 1430, + "token_counts/after_target": 484.5, + "token_counts/after_think": 179.75, + "token_counts/before_target": 1584.75, + "token_counts/before_think": 652.25 + }, + { + "avg_penalty/after_target": 2.7002410888671875, + "avg_penalty/after_think": 3.92475962638855, + "avg_penalty/before_target": 0.35451651364564896, + "avg_penalty/before_think": 0.5659860149025917, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.5, + "completions/max_terminated_length": 531.5, + "completions/mean_length": 193.234375, + "completions/mean_terminated_length": 193.234375, + "completions/min_length": 40.25, + "completions/min_terminated_length": 40.25, + "epoch": 0.7155, + "grad_norm": 3.034884214401245, + "kl": 14.27490234375, + "learning_rate": 4.5536096498497295e-06, + "loss": 1.3308, + "num_tokens": 42473744.0, + "reward": 1.73828125, + "reward_std": 0.5645482689142227, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.30233466625213623, + "rewards/tag_count_reward/mean": 0.87890625, + "rewards/tag_count_reward/std": 0.26438118517398834, + "step": 1431, + "token_counts/after_target": 453.0, + "token_counts/after_think": 38.5, + "token_counts/before_target": 1711.0, + "token_counts/before_think": 889.25 + }, + { + "avg_penalty/after_target": 2.9405550360679626, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.40524018555879593, + "avg_penalty/before_think": 0.4545874521136284, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + 
"completions/max_length": 649.0, + "completions/max_terminated_length": 638.25, + "completions/mean_length": 244.5, + "completions/mean_terminated_length": 232.94895935058594, + "completions/min_length": 52.75, + "completions/min_terminated_length": 52.75, + "epoch": 0.716, + "grad_norm": 3.1372008323669434, + "kl": 24.40625, + "learning_rate": 4.538980389855711e-06, + "loss": 2.1116, + "num_tokens": 42499088.0, + "reward": 1.35546875, + "reward_std": 0.8112072199583054, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.46875541657209396, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.3827076368033886, + "step": 1432, + "token_counts/after_target": 1167.75, + "token_counts/after_think": 33.5, + "token_counts/before_target": 2061.75, + "token_counts/before_think": 649.0 + }, + { + "avg_penalty/after_target": 2.6175352334976196, + "avg_penalty/after_think": 2.593670666217804, + "avg_penalty/before_target": 0.3954157568514347, + "avg_penalty/before_think": 0.5326363518834114, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 785.5, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 210.4375, + "completions/mean_terminated_length": 197.0343780517578, + "completions/min_length": 49.75, + "completions/min_terminated_length": 49.75, + "epoch": 0.7165, + "grad_norm": 3.144979953765869, + "kl": 14.2890625, + "learning_rate": 4.524367765074499e-06, + "loss": 1.2533, + "num_tokens": 42523580.0, + "reward": 1.61328125, + "reward_std": 0.7202936038374901, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.37675637751817703, + 
"rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.3508804887533188, + "step": 1433, + "token_counts/after_target": 662.25, + "token_counts/after_think": 41.5, + "token_counts/before_target": 1641.25, + "token_counts/before_think": 1022.0 + }, + { + "avg_penalty/after_target": 2.7250159978866577, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.5880558639764786, + "avg_penalty/before_think": 0.42589712888002396, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 822.25, + "completions/max_terminated_length": 709.0, + "completions/mean_length": 284.390625, + "completions/mean_terminated_length": 271.95208740234375, + "completions/min_length": 51.75, + "completions/min_terminated_length": 51.75, + "epoch": 0.717, + "grad_norm": 10.750720977783203, + "kl": 25.0, + "learning_rate": 4.509771820018682e-06, + "loss": 2.0595, + "num_tokens": 42554805.0, + "reward": 1.46484375, + "reward_std": 0.8467600047588348, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4634971097111702, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.40204037725925446, + "step": 1434, + "token_counts/after_target": 1178.25, + "token_counts/after_think": 116.25, + "token_counts/before_target": 2059.0, + "token_counts/before_think": 1196.75 + }, + { + "avg_penalty/after_target": 2.872521698474884, + "avg_penalty/after_think": 2.978949546813965, + "avg_penalty/before_target": 0.429660439491272, + "avg_penalty/before_think": 0.5581483021378517, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + 
"completions/max_length": 708.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 201.234375, + "completions/mean_terminated_length": 188.21041870117188, + "completions/min_length": 36.5, + "completions/min_terminated_length": 36.5, + "epoch": 0.7175, + "grad_norm": 5.626804351806641, + "kl": 19.625, + "learning_rate": 4.495192599150045e-06, + "loss": 1.8706, + "num_tokens": 42576356.0, + "reward": 1.6015625, + "reward_std": 0.7496596425771713, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.39964763820171356, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.3601987361907959, + "step": 1435, + "token_counts/after_target": 664.25, + "token_counts/after_think": 41.5, + "token_counts/before_target": 1478.0, + "token_counts/before_think": 1036.0 + }, + { + "avg_penalty/after_target": 2.8738667964935303, + "avg_penalty/after_think": 2.9994669556617737, + "avg_penalty/before_target": 0.37105177715420723, + "avg_penalty/before_think": 0.7016143649816513, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 796.5, + "completions/max_terminated_length": 655.0, + "completions/mean_length": 269.5, + "completions/mean_terminated_length": 242.4352684020996, + "completions/min_length": 57.75, + "completions/min_terminated_length": 57.75, + "epoch": 0.718, + "grad_norm": 4.329591751098633, + "kl": 26.0625, + "learning_rate": 4.480630146879419e-06, + "loss": 2.2289, + "num_tokens": 42603508.0, + "reward": 1.43359375, + "reward_std": 0.8531976193189621, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.48148179799318314, + "rewards/tag_count_reward/mean": 
0.76171875, + "rewards/tag_count_reward/std": 0.39712485671043396, + "step": 1436, + "token_counts/after_target": 982.75, + "token_counts/after_think": 169.75, + "token_counts/before_target": 1881.0, + "token_counts/before_think": 1278.5 + }, + { + "avg_penalty/after_target": 2.3872043192386627, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.36785733699798584, + "avg_penalty/before_think": 0.4465983137488365, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 173.09375, + "completions/mean_terminated_length": 173.09375, + "completions/min_length": 40.75, + "completions/min_terminated_length": 40.75, + "epoch": 0.7185, + "grad_norm": 10.76624584197998, + "kl": 26.1796875, + "learning_rate": 4.46608450756656e-06, + "loss": 1.9389, + "num_tokens": 42625578.0, + "reward": 1.44140625, + "reward_std": 0.7959918826818466, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.436277836561203, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.37562207132577896, + "step": 1437, + "token_counts/after_target": 582.0, + "token_counts/after_think": 17.75, + "token_counts/before_target": 1642.75, + "token_counts/before_think": 527.0 + }, + { + "avg_penalty/after_target": 3.12281197309494, + "avg_penalty/after_think": 2.9666196703910828, + "avg_penalty/before_target": 0.22403454035520554, + "avg_penalty/before_think": 0.4064020439982414, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.5, + 
"completions/max_terminated_length": 445.5, + "completions/mean_length": 169.859375, + "completions/mean_terminated_length": 169.859375, + "completions/min_length": 45.5, + "completions/min_terminated_length": 45.5, + "epoch": 0.719, + "grad_norm": 2.406238555908203, + "kl": 12.6875, + "learning_rate": 4.451555725520009e-06, + "loss": 1.0888, + "num_tokens": 42645185.0, + "reward": 1.62890625, + "reward_std": 0.7082875669002533, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.3014807254076004, + "step": 1438, + "token_counts/after_target": 367.75, + "token_counts/after_think": 130.25, + "token_counts/before_target": 1076.25, + "token_counts/before_think": 1143.5 + }, + { + "avg_penalty/after_target": 2.0072684288024902, + "avg_penalty/after_think": 2.7457683086395264, + "avg_penalty/before_target": 0.37974704429507256, + "avg_penalty/before_think": 0.6235622242093086, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 696.75, + "completions/max_terminated_length": 603.25, + "completions/mean_length": 245.9375, + "completions/mean_terminated_length": 234.31771087646484, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.7195, + "grad_norm": 6.371795654296875, + "kl": 19.4375, + "learning_rate": 4.437043844996952e-06, + "loss": 1.5123, + "num_tokens": 42670637.0, + "reward": 1.60546875, + "reward_std": 0.7856876999139786, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.80859375, + 
"rewards/tag_count_reward/std": 0.38175978511571884, + "step": 1439, + "token_counts/after_target": 712.25, + "token_counts/after_think": 48.25, + "token_counts/before_target": 2078.0, + "token_counts/before_think": 1096.5 + }, + { + "avg_penalty/after_target": 2.4946780800819397, + "avg_penalty/after_think": 2.8108969926834106, + "avg_penalty/before_target": 0.27397671714425087, + "avg_penalty/before_think": 0.4390180930495262, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 182.234375, + "completions/mean_terminated_length": 182.234375, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.72, + "grad_norm": 8.431096076965332, + "kl": 23.875, + "learning_rate": 4.422548910203099e-06, + "loss": 1.835, + "num_tokens": 42695052.0, + "reward": 1.48828125, + "reward_std": 0.7975000888109207, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.43925637751817703, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.3684206008911133, + "step": 1440, + "token_counts/after_target": 373.75, + "token_counts/after_think": 80.0, + "token_counts/before_target": 1723.0, + "token_counts/before_think": 739.0 + }, + { + "avg_penalty/after_target": 3.222500801086426, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.28607307747006416, + "avg_penalty/before_think": 0.5056113079190254, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 538.5, + "completions/max_terminated_length": 538.5, + 
"completions/mean_length": 227.015625, + "completions/mean_terminated_length": 227.015625, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.7205, + "grad_norm": 2.5293309688568115, + "kl": 15.953125, + "learning_rate": 4.408070965292534e-06, + "loss": 1.4317, + "num_tokens": 42721885.0, + "reward": 1.5703125, + "reward_std": 0.750467836856842, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4260597825050354, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.3440818786621094, + "step": 1441, + "token_counts/after_target": 789.5, + "token_counts/after_think": 67.0, + "token_counts/before_target": 1673.75, + "token_counts/before_think": 1102.0 + }, + { + "avg_penalty/after_target": 2.846423923969269, + "avg_penalty/after_think": 2.6333433389663696, + "avg_penalty/before_target": 0.2617679201066494, + "avg_penalty/before_think": 0.5509348660707474, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.5, + "completions/max_terminated_length": 478.5, + "completions/mean_length": 194.859375, + "completions/mean_terminated_length": 194.859375, + "completions/min_length": 46.5, + "completions/min_terminated_length": 46.5, + "epoch": 0.721, + "grad_norm": 3.8977303504943848, + "kl": 17.15625, + "learning_rate": 4.393610054367585e-06, + "loss": 1.4878, + "num_tokens": 42742756.0, + "reward": 1.5859375, + "reward_std": 0.7863476872444153, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4075859263539314, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3854871690273285, + "step": 1442, + 
"token_counts/after_target": 518.25, + "token_counts/after_think": 39.25, + "token_counts/before_target": 1477.25, + "token_counts/before_think": 1083.0 + }, + { + "avg_penalty/after_target": 2.2793740332126617, + "avg_penalty/after_think": 2.5669846534729004, + "avg_penalty/before_target": 0.26422516629099846, + "avg_penalty/before_think": 0.5020812451839447, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.75, + "completions/max_terminated_length": 414.75, + "completions/mean_length": 150.109375, + "completions/mean_terminated_length": 150.109375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.7215, + "grad_norm": 3.1535959243774414, + "kl": 15.28125, + "learning_rate": 4.379166221478697e-06, + "loss": 1.3817, + "num_tokens": 42764459.0, + "reward": 1.703125, + "reward_std": 0.7019822746515274, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.3723389655351639, + "rewards/tag_count_reward/mean": 0.859375, + "rewards/tag_count_reward/std": 0.3296433351933956, + "step": 1443, + "token_counts/after_target": 256.75, + "token_counts/after_think": 67.0, + "token_counts/before_target": 1208.0, + "token_counts/before_think": 870.0 + }, + { + "avg_penalty/after_target": 2.3093358874320984, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.29817642644047737, + "avg_penalty/before_think": 0.4882565066218376, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 182.3125, + 
"completions/mean_terminated_length": 182.3125, + "completions/min_length": 25.5, + "completions/min_terminated_length": 25.5, + "epoch": 0.722, + "grad_norm": 5.280372142791748, + "kl": 19.4921875, + "learning_rate": 4.3647395106242864e-06, + "loss": 1.5177, + "num_tokens": 42784687.0, + "reward": 1.48828125, + "reward_std": 0.7818250507116318, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4106728211045265, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.37477005273103714, + "step": 1444, + "token_counts/after_target": 368.0, + "token_counts/after_think": 87.75, + "token_counts/before_target": 1506.0, + "token_counts/before_think": 955.25 + }, + { + "avg_penalty/after_target": 1.7414018213748932, + "avg_penalty/after_think": 3.9762189984321594, + "avg_penalty/before_target": 0.2985404208302498, + "avg_penalty/before_think": 0.40001608431339264, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.5, + "completions/max_terminated_length": 545.5, + "completions/mean_length": 159.953125, + "completions/mean_terminated_length": 159.953125, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.7225, + "grad_norm": 3.6892457008361816, + "kl": 21.40625, + "learning_rate": 4.350329965750622e-06, + "loss": 1.7791, + "num_tokens": 42802460.0, + "reward": 1.56640625, + "reward_std": 0.7534366697072983, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.40316852182149887, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.3638009652495384, + "step": 1445, + "token_counts/after_target": 330.25, + 
"token_counts/after_think": 56.75, + "token_counts/before_target": 1392.0, + "token_counts/before_think": 780.25 + }, + { + "avg_penalty/after_target": 2.9889012277126312, + "avg_penalty/after_think": 2.99094158411026, + "avg_penalty/before_target": 0.36456580087542534, + "avg_penalty/before_think": 0.5420196354389191, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.25, + "completions/max_terminated_length": 607.25, + "completions/mean_length": 197.65625, + "completions/mean_terminated_length": 197.65625, + "completions/min_length": 42.25, + "completions/min_terminated_length": 42.25, + "epoch": 0.723, + "grad_norm": 10.629471778869629, + "kl": 21.453125, + "learning_rate": 4.335937630751675e-06, + "loss": 2.1531, + "num_tokens": 42825942.0, + "reward": 1.61328125, + "reward_std": 0.7557225078344345, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.36207935214042664, + "step": 1446, + "token_counts/after_target": 781.25, + "token_counts/after_think": 157.75, + "token_counts/before_target": 1625.25, + "token_counts/before_think": 598.25 + }, + { + "avg_penalty/after_target": 1.7434136867523193, + "avg_penalty/after_think": 1.9972543716430664, + "avg_penalty/before_target": 0.503445953130722, + "avg_penalty/before_think": 0.5027566254138947, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.25, + "completions/max_terminated_length": 649.25, + "completions/mean_length": 205.484375, + "completions/mean_terminated_length": 
205.484375, + "completions/min_length": 53.75, + "completions/min_terminated_length": 53.75, + "epoch": 0.7235, + "grad_norm": 2.655890703201294, + "kl": 25.09375, + "learning_rate": 4.321562549468991e-06, + "loss": 2.0739, + "num_tokens": 42847365.0, + "reward": 1.5234375, + "reward_std": 0.8466605991125107, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4383598491549492, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.4136684983968735, + "step": 1447, + "token_counts/after_target": 515.5, + "token_counts/after_think": 59.25, + "token_counts/before_target": 1742.75, + "token_counts/before_think": 970.25 + }, + { + "avg_penalty/after_target": 2.7452891767024994, + "avg_penalty/after_think": 3.950077474117279, + "avg_penalty/before_target": 0.31527454406023026, + "avg_penalty/before_think": 0.6873941347002983, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.5, + "completions/max_terminated_length": 501.5, + "completions/mean_length": 200.90625, + "completions/mean_terminated_length": 200.90625, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.724, + "grad_norm": 3.20141339302063, + "kl": 15.541015625, + "learning_rate": 4.307204765691559e-06, + "loss": 1.3589, + "num_tokens": 42868431.0, + "reward": 1.56640625, + "reward_std": 0.7705256640911102, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4079566150903702, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.3740403801202774, + "step": 1448, + "token_counts/after_target": 503.75, + "token_counts/after_think": 143.75, + 
"token_counts/before_target": 1775.75, + "token_counts/before_think": 791.25 + }, + { + "avg_penalty/after_target": 1.8879208266735077, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3964399993419647, + "avg_penalty/before_think": 0.4672197699546814, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 190.578125, + "completions/mean_terminated_length": 190.578125, + "completions/min_length": 45.5, + "completions/min_terminated_length": 45.5, + "epoch": 0.7245, + "grad_norm": 3.385934829711914, + "kl": 16.609375, + "learning_rate": 4.292864323155684e-06, + "loss": 1.5433, + "num_tokens": 42894548.0, + "reward": 1.6875, + "reward_std": 0.7188471555709839, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38772592693567276, + "rewards/tag_count_reward/mean": 0.859375, + "rewards/tag_count_reward/std": 0.3470490500330925, + "step": 1449, + "token_counts/after_target": 493.25, + "token_counts/after_think": 38.0, + "token_counts/before_target": 1687.5, + "token_counts/before_think": 830.5 + }, + { + "avg_penalty/after_target": 2.75615057349205, + "avg_penalty/after_think": 2.927916169166565, + "avg_penalty/before_target": 0.526754267513752, + "avg_penalty/before_think": 0.6541759222745895, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.25, + "completions/max_terminated_length": 720.25, + "completions/mean_length": 253.5625, + "completions/mean_terminated_length": 253.5625, + "completions/min_length": 31.75, + 
"completions/min_terminated_length": 31.75, + "epoch": 0.725, + "grad_norm": 4.992326736450195, + "kl": 22.96875, + "learning_rate": 4.27854126554484e-06, + "loss": 2.1158, + "num_tokens": 42919832.0, + "reward": 1.4140625, + "reward_std": 0.8688340932130814, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4613594636321068, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.4167100489139557, + "step": 1450, + "token_counts/after_target": 1001.0, + "token_counts/after_think": 32.25, + "token_counts/before_target": 2196.5, + "token_counts/before_think": 827.25 + }, + { + "avg_penalty/after_target": 2.184095650911331, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.40467412769794464, + "avg_penalty/before_think": 0.4010799080133438, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 194.09375, + "completions/mean_terminated_length": 194.09375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.7255, + "grad_norm": 2.8182878494262695, + "kl": 13.546875, + "learning_rate": 4.264235636489542e-06, + "loss": 1.245, + "num_tokens": 42941710.0, + "reward": 1.71875, + "reward_std": 0.8004322648048401, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.11180340498685837, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.39123913645744324, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.35645319521427155, + "step": 1451, + "token_counts/after_target": 490.25, + "token_counts/after_think": 58.5, + "token_counts/before_target": 1520.25, + "token_counts/before_think": 
1036.5 + }, + { + "avg_penalty/after_target": 1.7762271761894226, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3608385920524597, + "avg_penalty/before_think": 0.4627635031938553, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.5, + "completions/max_terminated_length": 631.5, + "completions/mean_length": 210.375, + "completions/mean_terminated_length": 210.375, + "completions/min_length": 48.5, + "completions/min_terminated_length": 48.5, + "epoch": 0.726, + "grad_norm": 7.010539531707764, + "kl": 17.765625, + "learning_rate": 4.249947479567218e-06, + "loss": 1.3106, + "num_tokens": 42964006.0, + "reward": 1.625, + "reward_std": 0.7857866436243057, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.3792026564478874, + "step": 1452, + "token_counts/after_target": 314.25, + "token_counts/after_think": 29.5, + "token_counts/before_target": 1842.0, + "token_counts/before_think": 1180.25 + }, + { + "avg_penalty/after_target": 3.209086537361145, + "avg_penalty/after_think": 2.6370972394943237, + "avg_penalty/before_target": 0.34410519897937775, + "avg_penalty/before_think": 0.4535277634859085, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.75, + "completions/max_terminated_length": 592.75, + "completions/mean_length": 211.640625, + "completions/mean_terminated_length": 211.640625, + "completions/min_length": 43.75, + "completions/min_terminated_length": 43.75, + "epoch": 0.7265, + "grad_norm": 
6.5050435066223145, + "kl": 16.359375, + "learning_rate": 4.235676838302069e-06, + "loss": 1.5595, + "num_tokens": 42987119.0, + "reward": 1.640625, + "reward_std": 0.7929850667715073, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4141380712389946, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.3772086650133133, + "step": 1453, + "token_counts/after_target": 541.0, + "token_counts/after_think": 65.5, + "token_counts/before_target": 1217.75, + "token_counts/before_think": 1562.0 + }, + { + "avg_penalty/after_target": 2.7522096037864685, + "avg_penalty/after_think": 2.8125879764556885, + "avg_penalty/before_target": 0.3158136121928692, + "avg_penalty/before_think": 0.41963991522789, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 620.25, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 202.953125, + "completions/mean_terminated_length": 189.49583435058594, + "completions/min_length": 45.5, + "completions/min_terminated_length": 45.5, + "epoch": 0.727, + "grad_norm": 8.950922012329102, + "kl": 15.875, + "learning_rate": 4.221423756164949e-06, + "loss": 1.6699, + "num_tokens": 43008844.0, + "reward": 1.72265625, + "reward_std": 0.6143714934587479, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38336414843797684, + "rewards/tag_count_reward/mean": 0.89453125, + "rewards/tag_count_reward/std": 0.23870792612433434, + "step": 1454, + "token_counts/after_target": 549.25, + "token_counts/after_think": 54.5, + "token_counts/before_target": 1769.25, + "token_counts/before_think": 874.25 + }, + { + "avg_penalty/after_target": 
2.72207048535347, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3441852703690529, + "avg_penalty/before_think": 0.5374482423067093, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 189.734375, + "completions/mean_terminated_length": 189.734375, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.7275, + "grad_norm": 8.962912559509277, + "kl": 12.109375, + "learning_rate": 4.207188276573214e-06, + "loss": 1.3686, + "num_tokens": 43030827.0, + "reward": 1.765625, + "reward_std": 0.5825984627008438, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3265564441680908, + "rewards/tag_count_reward/mean": 0.890625, + "rewards/tag_count_reward/std": 0.257498100399971, + "step": 1455, + "token_counts/after_target": 451.5, + "token_counts/after_think": 46.5, + "token_counts/before_target": 1618.75, + "token_counts/before_think": 919.0 + }, + { + "avg_penalty/after_target": 2.2552627623081207, + "avg_penalty/after_think": 1.9945812821388245, + "avg_penalty/before_target": 0.5642417818307877, + "avg_penalty/before_think": 0.6034197807312012, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 806.25, + "completions/max_terminated_length": 717.5, + "completions/mean_length": 273.921875, + "completions/mean_terminated_length": 250.22084045410156, + "completions/min_length": 28.5, + "completions/min_terminated_length": 28.5, + "epoch": 0.728, + "grad_norm": 3.7989072799682617, + "kl": 29.25, + "learning_rate": 
4.192970442890602e-06, + "loss": 2.4365, + "num_tokens": 43059094.0, + "reward": 1.3046875, + "reward_std": 0.9069146066904068, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4939897432923317, + "rewards/tag_count_reward/mean": 0.6953125, + "rewards/tag_count_reward/std": 0.4447778984904289, + "step": 1456, + "token_counts/after_target": 1295.75, + "token_counts/after_think": 132.5, + "token_counts/before_target": 2121.0, + "token_counts/before_think": 833.5 + }, + { + "avg_penalty/after_target": 2.1614875495433807, + "avg_penalty/after_think": 3.599396526813507, + "avg_penalty/before_target": 0.42265991121530533, + "avg_penalty/before_think": 0.43146654963493347, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 272.265625, + "completions/mean_terminated_length": 272.265625, + "completions/min_length": 44.75, + "completions/min_terminated_length": 44.75, + "epoch": 0.7285, + "grad_norm": 2.7975761890411377, + "kl": 25.625, + "learning_rate": 4.178770298427107e-06, + "loss": 2.186, + "num_tokens": 43085543.0, + "reward": 1.66015625, + "reward_std": 0.8686263859272003, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.11967839300632477, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.3991628438234329, + "step": 1457, + "token_counts/after_target": 1000.75, + "token_counts/after_think": 46.75, + "token_counts/before_target": 2337.75, + "token_counts/before_think": 971.0 + }, + { + "avg_penalty/after_target": 2.5075678527355194, + "avg_penalty/after_think": 3.0, + 
"avg_penalty/before_target": 0.4866771548986435, + "avg_penalty/before_think": 0.46517805755138397, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 829.25, + "completions/max_terminated_length": 829.25, + "completions/mean_length": 299.453125, + "completions/mean_terminated_length": 299.453125, + "completions/min_length": 52.25, + "completions/min_terminated_length": 52.25, + "epoch": 0.729, + "grad_norm": 5.236783504486084, + "kl": 21.875, + "learning_rate": 4.164587886438827e-06, + "loss": 2.0388, + "num_tokens": 43114052.0, + "reward": 1.6171875, + "reward_std": 0.7946110665798187, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.3706955462694168, + "step": 1458, + "token_counts/after_target": 1115.0, + "token_counts/after_think": 60.0, + "token_counts/before_target": 2012.75, + "token_counts/before_think": 1603.5 + }, + { + "avg_penalty/after_target": 2.5327949821949005, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3844889923930168, + "avg_penalty/before_think": 0.4555800184607506, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.5, + "completions/max_terminated_length": 626.5, + "completions/mean_length": 236.921875, + "completions/mean_terminated_length": 236.921875, + "completions/min_length": 49.25, + "completions/min_terminated_length": 49.25, + "epoch": 0.7295, + "grad_norm": 4.902469158172607, + "kl": 27.34375, + "learning_rate": 4.150423250127846e-06, + "loss": 2.2641, + "num_tokens": 
43138143.0, + "reward": 1.5078125, + "reward_std": 0.8413091599941254, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44187305867671967, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.41328903287649155, + "step": 1459, + "token_counts/after_target": 785.25, + "token_counts/after_think": 29.5, + "token_counts/before_target": 1926.75, + "token_counts/before_think": 1049.25 + }, + { + "avg_penalty/after_target": 2.490949034690857, + "avg_penalty/after_think": 3.996749460697174, + "avg_penalty/before_target": 0.5814541652798653, + "avg_penalty/before_think": 0.5442570149898529, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.5, + "completions/max_terminated_length": 659.5, + "completions/mean_length": 201.984375, + "completions/mean_terminated_length": 201.984375, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.73, + "grad_norm": 3.541585922241211, + "kl": 26.125, + "learning_rate": 4.136276432642107e-06, + "loss": 2.3308, + "num_tokens": 43160334.0, + "reward": 1.55078125, + "reward_std": 0.8163826912641525, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4462348371744156, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.3861427530646324, + "step": 1460, + "token_counts/after_target": 730.5, + "token_counts/after_think": 212.5, + "token_counts/before_target": 1420.75, + "token_counts/before_think": 868.0 + }, + { + "avg_penalty/after_target": 2.41901695728302, + "avg_penalty/after_think": 3.8209343552589417, + "avg_penalty/before_target": 0.38047171756625175, + 
"avg_penalty/before_think": 0.5126832649111748, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.25, + "completions/max_terminated_length": 620.25, + "completions/mean_length": 217.921875, + "completions/mean_terminated_length": 217.921875, + "completions/min_length": 51.25, + "completions/min_terminated_length": 51.25, + "epoch": 0.7305, + "grad_norm": 2.9438488483428955, + "kl": 21.46875, + "learning_rate": 4.12214747707527e-06, + "loss": 1.8301, + "num_tokens": 43184713.0, + "reward": 1.546875, + "reward_std": 0.833513468503952, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4519384130835533, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.37533868849277496, + "step": 1461, + "token_counts/after_target": 502.75, + "token_counts/after_think": 135.0, + "token_counts/before_target": 1627.75, + "token_counts/before_think": 1221.25 + }, + { + "avg_penalty/after_target": 2.416069597005844, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4093478061258793, + "avg_penalty/before_think": 0.4270844906568527, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.5, + "completions/max_terminated_length": 561.5, + "completions/mean_length": 231.484375, + "completions/mean_terminated_length": 231.484375, + "completions/min_length": 45.25, + "completions/min_terminated_length": 45.25, + "epoch": 0.731, + "grad_norm": 4.191717147827148, + "kl": 22.59375, + "learning_rate": 4.108036426466577e-06, + "loss": 1.8727, + "num_tokens": 43209448.0, + "reward": 1.50390625, + "reward_std": 
0.8331011533737183, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4462348371744156, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.4043387398123741, + "step": 1462, + "token_counts/after_target": 900.25, + "token_counts/after_think": 96.0, + "token_counts/before_target": 1822.0, + "token_counts/before_think": 885.5 + }, + { + "avg_penalty/after_target": 1.5939196944236755, + "avg_penalty/after_think": 3.965541958808899, + "avg_penalty/before_target": 0.5260341018438339, + "avg_penalty/before_think": 0.5357344374060631, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.75, + "completions/max_terminated_length": 713.75, + "completions/mean_length": 229.421875, + "completions/mean_terminated_length": 229.421875, + "completions/min_length": 28.75, + "completions/min_terminated_length": 28.75, + "epoch": 0.7315, + "grad_norm": 10.703903198242188, + "kl": 26.9609375, + "learning_rate": 4.093943323800746e-06, + "loss": 2.0341, + "num_tokens": 43236259.0, + "reward": 1.47265625, + "reward_std": 0.8637381792068481, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4339347705245018, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.4302605614066124, + "step": 1463, + "token_counts/after_target": 800.25, + "token_counts/after_think": 115.75, + "token_counts/before_target": 1659.0, + "token_counts/before_think": 1095.75 + }, + { + "avg_penalty/after_target": 2.766759753227234, + "avg_penalty/after_think": 0.9310582876205444, + "avg_penalty/before_target": 0.29992376267910004, + "avg_penalty/before_think": 0.41131072118878365, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.5, + "completions/max_terminated_length": 486.5, + "completions/mean_length": 210.6875, + "completions/mean_terminated_length": 210.6875, + "completions/min_length": 51.25, + "completions/min_terminated_length": 51.25, + "epoch": 0.732, + "grad_norm": 5.198312282562256, + "kl": 18.296875, + "learning_rate": 4.0798682120078046e-06, + "loss": 1.4586, + "num_tokens": 43264575.0, + "reward": 1.63671875, + "reward_std": 0.6891620755195618, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4000816270709038, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.31543198227882385, + "step": 1464, + "token_counts/after_target": 529.75, + "token_counts/after_think": 7.75, + "token_counts/before_target": 1702.0, + "token_counts/before_think": 1131.5 + }, + { + "avg_penalty/after_target": 2.7988889515399933, + "avg_penalty/after_think": 3.528304159641266, + "avg_penalty/before_target": 0.4787951707839966, + "avg_penalty/before_think": 0.6875415146350861, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 710.75, + "completions/max_terminated_length": 637.75, + "completions/mean_length": 222.921875, + "completions/mean_terminated_length": 211.4781265258789, + "completions/min_length": 50.5, + "completions/min_terminated_length": 50.5, + "epoch": 0.7325, + "grad_norm": 4.882826328277588, + "kl": 22.609375, + "learning_rate": 4.065811133962987e-06, + "loss": 1.8929, + "num_tokens": 43290218.0, + "reward": 1.484375, + "reward_std": 0.8718949556350708, + 
"rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45247192680835724, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.4125816524028778, + "step": 1465, + "token_counts/after_target": 817.0, + "token_counts/after_think": 60.5, + "token_counts/before_target": 1460.75, + "token_counts/before_think": 1228.5 + }, + { + "avg_penalty/after_target": 2.591688811779022, + "avg_penalty/after_think": 3.6863654255867004, + "avg_penalty/before_target": 0.4905667193233967, + "avg_penalty/before_think": 0.584722638130188, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 668.0, + "completions/max_terminated_length": 659.25, + "completions/mean_length": 221.75, + "completions/mean_terminated_length": 209.39479446411133, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.733, + "grad_norm": 7.775198459625244, + "kl": 17.3095703125, + "learning_rate": 4.051772132486589e-06, + "loss": 1.8192, + "num_tokens": 43313386.0, + "reward": 1.63671875, + "reward_std": 0.5898780971765518, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.32528156042099, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.27792633324861526, + "step": 1466, + "token_counts/after_target": 790.25, + "token_counts/after_think": 81.0, + "token_counts/before_target": 1661.75, + "token_counts/before_think": 1015.0 + }, + { + "avg_penalty/after_target": 2.5433870255947113, + "avg_penalty/after_think": 3.279472231864929, + "avg_penalty/before_target": 0.38626354932785034, + "avg_penalty/before_think": 0.6220911741256714, + "clip_ratio/high_max": 
0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 749.0, + "completions/max_terminated_length": 658.75, + "completions/mean_length": 249.8125, + "completions/mean_terminated_length": 237.20312881469727, + "completions/min_length": 34.5, + "completions/min_terminated_length": 34.5, + "epoch": 0.7335, + "grad_norm": 6.889985084533691, + "kl": 22.65625, + "learning_rate": 4.037751250343841e-06, + "loss": 2.167, + "num_tokens": 43341038.0, + "reward": 1.58984375, + "reward_std": 0.7964596003293991, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.42080147564411163, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.39046309143304825, + "step": 1467, + "token_counts/after_target": 702.75, + "token_counts/after_think": 160.25, + "token_counts/before_target": 1735.75, + "token_counts/before_think": 1398.25 + }, + { + "avg_penalty/after_target": 1.9898539185523987, + "avg_penalty/after_think": 2.8785189986228943, + "avg_penalty/before_target": 0.37184396386146545, + "avg_penalty/before_think": 0.3467683456838131, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.25, + "completions/max_terminated_length": 430.25, + "completions/mean_length": 147.375, + "completions/mean_terminated_length": 147.375, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.734, + "grad_norm": 4.141942024230957, + "kl": 23.78125, + "learning_rate": 4.023748530244789e-06, + "loss": 1.9562, + "num_tokens": 43361254.0, + "reward": 1.59765625, + "reward_std": 0.8076972514390945, + "rewards/accuracy_reward/mean": 
0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.41194770485162735, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.382351852953434, + "step": 1468, + "token_counts/after_target": 269.25, + "token_counts/after_think": 68.0, + "token_counts/before_target": 1170.0, + "token_counts/before_think": 850.75 + }, + { + "avg_penalty/after_target": 2.5534071028232574, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4519936963915825, + "avg_penalty/before_think": 0.4158422574400902, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.75, + "completions/max_terminated_length": 707.75, + "completions/mean_length": 223.78125, + "completions/mean_terminated_length": 223.78125, + "completions/min_length": 43.75, + "completions/min_terminated_length": 43.75, + "epoch": 0.7345, + "grad_norm": 7.331518173217773, + "kl": 24.03125, + "learning_rate": 4.009764014844143e-06, + "loss": 2.2015, + "num_tokens": 43384152.0, + "reward": 1.54296875, + "reward_std": 0.8386952728033066, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42867646366357803, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.412555567920208, + "step": 1469, + "token_counts/after_target": 874.75, + "token_counts/after_think": 48.5, + "token_counts/before_target": 1652.0, + "token_counts/before_think": 1005.25 + }, + { + "avg_penalty/after_target": 2.070741593837738, + "avg_penalty/after_think": 2.980908691883087, + "avg_penalty/before_target": 0.5127030648291111, + "avg_penalty/before_think": 0.4521429054439068, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 674.75, + "completions/max_terminated_length": 601.25, + "completions/mean_length": 201.84375, + "completions/mean_terminated_length": 189.91875457763672, + "completions/min_length": 33.5, + "completions/min_terminated_length": 33.5, + "epoch": 0.735, + "grad_norm": 5.101244926452637, + "kl": 28.46875, + "learning_rate": 3.9957977467411615e-06, + "loss": 2.2826, + "num_tokens": 43408590.0, + "reward": 1.4296875, + "reward_std": 0.862227588891983, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.46566852182149887, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.4125082641839981, + "step": 1470, + "token_counts/after_target": 728.0, + "token_counts/after_think": 21.25, + "token_counts/before_target": 1697.25, + "token_counts/before_think": 783.0 + }, + { + "avg_penalty/after_target": 2.0217381417751312, + "avg_penalty/after_think": 2.850955307483673, + "avg_penalty/before_target": 0.27425719425082207, + "avg_penalty/before_think": 0.49617841094732285, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.25, + "completions/max_terminated_length": 549.25, + "completions/mean_length": 199.59375, + "completions/mean_terminated_length": 199.59375, + "completions/min_length": 46.75, + "completions/min_terminated_length": 46.75, + "epoch": 0.7355, + "grad_norm": 4.166291236877441, + "kl": 19.75, + "learning_rate": 3.981849768479516e-06, + "loss": 1.5795, + "num_tokens": 43430196.0, + "reward": 1.47265625, + "reward_std": 0.8192173987627029, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4713720977306366, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.37165719270706177, + "step": 1471, + "token_counts/after_target": 366.0, + "token_counts/after_think": 137.75, + "token_counts/before_target": 1584.75, + "token_counts/before_think": 1105.0 + }, + { + "avg_penalty/after_target": 2.7680151760578156, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.33584823086857796, + "avg_penalty/before_think": 0.4516856260597706, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.25, + "completions/max_terminated_length": 649.25, + "completions/mean_length": 191.625, + "completions/mean_terminated_length": 191.625, + "completions/min_length": 29.5, + "completions/min_terminated_length": 29.5, + "epoch": 0.736, + "grad_norm": 6.995062828063965, + "kl": 22.6875, + "learning_rate": 3.967920122547175e-06, + "loss": 2.0969, + "num_tokens": 43453916.0, + "reward": 1.63671875, + "reward_std": 0.7750662118196487, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.3777644783258438, + "step": 1472, + "token_counts/after_target": 517.25, + "token_counts/after_think": 91.0, + "token_counts/before_target": 1308.25, + "token_counts/before_think": 1149.5 + }, + { + "avg_penalty/after_target": 2.139346957206726, + "avg_penalty/after_think": 3.7079604864120483, + "avg_penalty/before_target": 0.41801418364048004, + "avg_penalty/before_think": 0.41802243143320084, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.25, + "completions/max_terminated_length": 425.25, + "completions/mean_length": 152.234375, + "completions/mean_terminated_length": 152.234375, + "completions/min_length": 36.5, + "completions/min_terminated_length": 36.5, + "epoch": 0.7365, + "grad_norm": 6.786215782165527, + "kl": 13.140625, + "learning_rate": 3.954008851376252e-06, + "loss": 1.3377, + "num_tokens": 43473467.0, + "reward": 1.7734375, + "reward_std": 0.5571784228086472, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3375816270709038, + "rewards/tag_count_reward/mean": 0.9140625, + "rewards/tag_count_reward/std": 0.23486477881669998, + "step": 1473, + "token_counts/after_target": 300.0, + "token_counts/after_think": 96.25, + "token_counts/before_target": 1188.0, + "token_counts/before_think": 851.5 + }, + { + "avg_penalty/after_target": 1.8245288133621216, + "avg_penalty/after_think": 3.9187321066856384, + "avg_penalty/before_target": 0.4380270093679428, + "avg_penalty/before_think": 0.6844813302159309, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 860.25, + "completions/max_terminated_length": 724.75, + "completions/mean_length": 265.765625, + "completions/mean_terminated_length": 242.55417251586914, + "completions/min_length": 34.25, + "completions/min_terminated_length": 34.25, + "epoch": 0.737, + "grad_norm": 4.609158515930176, + "kl": 27.8125, + "learning_rate": 3.940115997342892e-06, + "loss": 2.285, + "num_tokens": 43501084.0, + "reward": 1.4609375, + "reward_std": 0.7980498224496841, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + 
"rewards/format_reward/std": 0.47669370472431183, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.34954067319631577, + "step": 1474, + "token_counts/after_target": 961.75, + "token_counts/after_think": 135.75, + "token_counts/before_target": 2142.0, + "token_counts/before_think": 1012.75 + }, + { + "avg_penalty/after_target": 2.3616052865982056, + "avg_penalty/after_think": 3.546493709087372, + "avg_penalty/before_target": 0.37079083919525146, + "avg_penalty/before_think": 0.47467611730098724, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.75, + "completions/max_terminated_length": 583.75, + "completions/mean_length": 209.671875, + "completions/mean_terminated_length": 209.671875, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.7375, + "grad_norm": 6.224817752838135, + "kl": 18.4248046875, + "learning_rate": 3.9262416027671354e-06, + "loss": 1.8075, + "num_tokens": 43523191.0, + "reward": 1.62109375, + "reward_std": 0.6719994992017746, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.36489029973745346, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.31492840498685837, + "step": 1475, + "token_counts/after_target": 756.5, + "token_counts/after_think": 46.25, + "token_counts/before_target": 1894.0, + "token_counts/before_think": 658.0 + }, + { + "avg_penalty/after_target": 3.5759994983673096, + "avg_penalty/after_think": 2.8965539932250977, + "avg_penalty/before_target": 0.22237468883395195, + "avg_penalty/before_think": 0.5519060045480728, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 205.546875, + "completions/mean_terminated_length": 205.546875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.738, + "grad_norm": 3.066092014312744, + "kl": 20.765625, + "learning_rate": 3.912385709912794e-06, + "loss": 1.8464, + "num_tokens": 43547386.0, + "reward": 1.54296875, + "reward_std": 0.8121281862258911, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.3834483399987221, + "step": 1476, + "token_counts/after_target": 501.0, + "token_counts/after_think": 174.5, + "token_counts/before_target": 1772.0, + "token_counts/before_think": 841.25 + }, + { + "avg_penalty/after_target": 1.9526348412036896, + "avg_penalty/after_think": 2.19866681098938, + "avg_penalty/before_target": 0.31727730855345726, + "avg_penalty/before_think": 0.593233123421669, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 209.875, + "completions/mean_terminated_length": 209.875, + "completions/min_length": 38.75, + "completions/min_terminated_length": 38.75, + "epoch": 0.7385, + "grad_norm": 6.356523036956787, + "kl": 25.875, + "learning_rate": 3.898548360987325e-06, + "loss": 2.0126, + "num_tokens": 43572450.0, + "reward": 1.4921875, + "reward_std": 0.826792910695076, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 
0.43708496540784836, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.40664908289909363, + "step": 1477, + "token_counts/after_target": 467.0, + "token_counts/after_think": 63.25, + "token_counts/before_target": 1819.25, + "token_counts/before_think": 1008.5 + }, + { + "avg_penalty/after_target": 3.2363668084144592, + "avg_penalty/after_think": 2.995323359966278, + "avg_penalty/before_target": 0.4092422276735306, + "avg_penalty/before_think": 0.41175302118062973, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 637.0, + "completions/max_terminated_length": 500.25, + "completions/mean_length": 192.265625, + "completions/mean_terminated_length": 179.3208351135254, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.739, + "grad_norm": 12.504798889160156, + "kl": 12.953125, + "learning_rate": 3.88472959814169e-06, + "loss": 1.5021, + "num_tokens": 43595603.0, + "reward": 1.68359375, + "reward_std": 0.6767023205757141, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3943893313407898, + "rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.3077758029103279, + "step": 1478, + "token_counts/after_target": 561.5, + "token_counts/after_think": 107.5, + "token_counts/before_target": 1340.5, + "token_counts/before_think": 1066.75 + }, + { + "avg_penalty/after_target": 1.7830774784088135, + "avg_penalty/after_think": 1.6863023042678833, + "avg_penalty/before_target": 0.30585886910557747, + "avg_penalty/before_think": 0.29606402665376663, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 641.25, + "completions/max_terminated_length": 641.25, + "completions/mean_length": 195.46875, + "completions/mean_terminated_length": 195.46875, + "completions/min_length": 25.5, + "completions/min_terminated_length": 25.5, + "epoch": 0.7395, + "grad_norm": 5.344656467437744, + "kl": 16.890625, + "learning_rate": 3.8709294634702374e-06, + "loss": 1.2972, + "num_tokens": 43617457.0, + "reward": 1.578125, + "reward_std": 0.824585348367691, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.42516325414180756, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.4019997715950012, + "step": 1479, + "token_counts/after_target": 303.25, + "token_counts/after_think": 13.0, + "token_counts/before_target": 1629.0, + "token_counts/before_think": 1182.25 + }, + { + "avg_penalty/after_target": 2.9606299996376038, + "avg_penalty/after_think": 3.321190983057022, + "avg_penalty/before_target": 0.3875853233039379, + "avg_penalty/before_think": 0.34735309332609177, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 723.75, + "completions/max_terminated_length": 657.0, + "completions/mean_length": 232.953125, + "completions/mean_terminated_length": 221.86146545410156, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.74, + "grad_norm": 8.177590370178223, + "kl": 31.09375, + "learning_rate": 3.857147999010568e-06, + "loss": 2.4293, + "num_tokens": 43641150.0, + "reward": 1.33203125, + "reward_std": 0.9052022397518158, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.46822190284729004, + 
"rewards/tag_count_reward/mean": 0.67578125, + "rewards/tag_count_reward/std": 0.441455215215683, + "step": 1480, + "token_counts/after_target": 986.0, + "token_counts/after_think": 26.25, + "token_counts/before_target": 1931.5, + "token_counts/before_think": 783.5 + }, + { + "avg_penalty/after_target": 2.3105635344982147, + "avg_penalty/after_think": 2.5454810857772827, + "avg_penalty/before_target": 0.515326052904129, + "avg_penalty/before_think": 0.42433543130755424, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.25, + "completions/max_terminated_length": 492.25, + "completions/mean_length": 153.921875, + "completions/mean_terminated_length": 153.921875, + "completions/min_length": 31.75, + "completions/min_terminated_length": 31.75, + "epoch": 0.7405, + "grad_norm": 3.714505195617676, + "kl": 22.046875, + "learning_rate": 3.8433852467434175e-06, + "loss": 1.8606, + "num_tokens": 43661017.0, + "reward": 1.55078125, + "reward_std": 0.8126392066478729, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.40316852182149887, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.398878812789917, + "step": 1481, + "token_counts/after_target": 395.25, + "token_counts/after_think": 36.25, + "token_counts/before_target": 1163.75, + "token_counts/before_think": 867.5 + }, + { + "avg_penalty/after_target": 2.264179229736328, + "avg_penalty/after_think": 3.784268856048584, + "avg_penalty/before_target": 0.5633988231420517, + "avg_penalty/before_think": 0.5812869444489479, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + 
"completions/max_length": 780.5, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 248.5, + "completions/mean_terminated_length": 237.67188262939453, + "completions/min_length": 52.25, + "completions/min_terminated_length": 52.25, + "epoch": 0.741, + "grad_norm": 9.864948272705078, + "kl": 22.09375, + "learning_rate": 3.829641248592515e-06, + "loss": 2.2307, + "num_tokens": 43691289.0, + "reward": 1.6328125, + "reward_std": 0.7476237565279007, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.3511649966239929, + "step": 1482, + "token_counts/after_target": 1120.25, + "token_counts/after_think": 55.0, + "token_counts/before_target": 1804.5, + "token_counts/before_think": 996.25 + }, + { + "avg_penalty/after_target": 2.8617839217185974, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.41413508728146553, + "avg_penalty/before_think": 0.3949640281498432, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.0, + "completions/max_terminated_length": 654.0, + "completions/mean_length": 234.125, + "completions/mean_terminated_length": 234.125, + "completions/min_length": 60.25, + "completions/min_terminated_length": 60.25, + "epoch": 0.7415, + "grad_norm": 6.34937047958374, + "kl": 23.71875, + "learning_rate": 3.81591604642446e-06, + "loss": 2.1474, + "num_tokens": 43716801.0, + "reward": 1.53125, + "reward_std": 0.8187496066093445, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44938503205776215, + "rewards/tag_count_reward/mean": 0.796875, + 
"rewards/tag_count_reward/std": 0.3837965428829193, + "step": 1483, + "token_counts/after_target": 797.5, + "token_counts/after_think": 120.25, + "token_counts/before_target": 1993.75, + "token_counts/before_think": 834.5 + }, + { + "avg_penalty/after_target": 2.713574916124344, + "avg_penalty/after_think": 0.8677309155464172, + "avg_penalty/before_target": 0.33670778200030327, + "avg_penalty/before_think": 0.5319893285632133, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.75, + "completions/max_terminated_length": 580.75, + "completions/mean_length": 230.09375, + "completions/mean_terminated_length": 230.09375, + "completions/min_length": 44.75, + "completions/min_terminated_length": 44.75, + "epoch": 0.742, + "grad_norm": 2.977111339569092, + "kl": 20.03125, + "learning_rate": 3.8022096820486023e-06, + "loss": 1.6699, + "num_tokens": 43740199.0, + "reward": 1.4921875, + "reward_std": 0.7910887748003006, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.45028156042099, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.37273015826940536, + "step": 1484, + "token_counts/after_target": 697.25, + "token_counts/after_think": 67.5, + "token_counts/before_target": 1592.25, + "token_counts/before_think": 1324.5 + }, + { + "avg_penalty/after_target": 2.2620757818222046, + "avg_penalty/after_think": 2.787731647491455, + "avg_penalty/before_target": 0.5447104647755623, + "avg_penalty/before_think": 0.37498967722058296, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 737.5, + 
"completions/max_terminated_length": 737.5, + "completions/mean_length": 187.6875, + "completions/mean_terminated_length": 187.6875, + "completions/min_length": 34.25, + "completions/min_terminated_length": 34.25, + "epoch": 0.7425, + "grad_norm": 8.522459983825684, + "kl": 24.265625, + "learning_rate": 3.7885221972168974e-06, + "loss": 2.3136, + "num_tokens": 43764499.0, + "reward": 1.5546875, + "reward_std": 0.7924706637859344, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.41503459960222244, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.38851017504930496, + "step": 1485, + "token_counts/after_target": 801.5, + "token_counts/after_think": 143.0, + "token_counts/before_target": 1285.0, + "token_counts/before_think": 773.5 + }, + { + "avg_penalty/after_target": 2.3773683607578278, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5915956199169159, + "avg_penalty/before_think": 0.5332842692732811, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 722.25, + "completions/max_terminated_length": 689.5, + "completions/mean_length": 256.328125, + "completions/mean_terminated_length": 244.8625030517578, + "completions/min_length": 49.5, + "completions/min_terminated_length": 49.5, + "epoch": 0.743, + "grad_norm": 12.244877815246582, + "kl": 20.265625, + "learning_rate": 3.774853633623806e-06, + "loss": 2.1489, + "num_tokens": 43793880.0, + "reward": 1.640625, + "reward_std": 0.6959435939788818, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4022643193602562, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 
0.3020078707486391, + "step": 1486, + "token_counts/after_target": 1073.0, + "token_counts/after_think": 181.25, + "token_counts/before_target": 1769.5, + "token_counts/before_think": 1077.5 + }, + { + "avg_penalty/after_target": 2.578528717160225, + "avg_penalty/after_think": 1.7722386717796326, + "avg_penalty/before_target": 0.3645913377404213, + "avg_penalty/before_think": 0.41965460777282715, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 165.953125, + "completions/mean_terminated_length": 165.953125, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.7435, + "grad_norm": 4.96734619140625, + "kl": 23.21875, + "learning_rate": 3.7612040329061405e-06, + "loss": 1.8354, + "num_tokens": 43816005.0, + "reward": 1.53125, + "reward_std": 0.8213532716035843, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4519384130835533, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.39865589886903763, + "step": 1487, + "token_counts/after_target": 490.25, + "token_counts/after_think": 17.25, + "token_counts/before_target": 1357.5, + "token_counts/before_think": 790.25 + }, + { + "avg_penalty/after_target": 2.7620414197444916, + "avg_penalty/after_think": 3.1248490810394287, + "avg_penalty/before_target": 0.383707620203495, + "avg_penalty/before_think": 0.4365735463798046, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + 
"completions/mean_length": 154.21875, + "completions/mean_terminated_length": 154.21875, + "completions/min_length": 35.5, + "completions/min_terminated_length": 35.5, + "epoch": 0.744, + "grad_norm": 3.102196216583252, + "kl": 22.515625, + "learning_rate": 3.747573436642952e-06, + "loss": 1.8849, + "num_tokens": 43836035.0, + "reward": 1.546875, + "reward_std": 0.8151131719350815, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42867646366357803, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.4041426107287407, + "step": 1488, + "token_counts/after_target": 397.25, + "token_counts/after_think": 7.75, + "token_counts/before_target": 1273.25, + "token_counts/before_think": 789.25 + }, + { + "avg_penalty/after_target": 2.4802702367305756, + "avg_penalty/after_think": 2.592751681804657, + "avg_penalty/before_target": 0.1942821852862835, + "avg_penalty/before_think": 0.46051526814699173, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.25, + "completions/max_terminated_length": 472.25, + "completions/mean_length": 141.03125, + "completions/mean_terminated_length": 141.03125, + "completions/min_length": 34.5, + "completions/min_terminated_length": 34.5, + "epoch": 0.7445, + "grad_norm": 3.3651046752929688, + "kl": 18.625, + "learning_rate": 3.7339618863553983e-06, + "loss": 1.5863, + "num_tokens": 43853077.0, + "reward": 1.671875, + "reward_std": 0.728599026799202, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38336414843797684, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.3515702039003372, + "step": 1489, + 
"token_counts/after_target": 219.25, + "token_counts/after_think": 52.25, + "token_counts/before_target": 1203.0, + "token_counts/before_think": 782.0 + }, + { + "avg_penalty/after_target": 2.360570400953293, + "avg_penalty/after_think": 2.734126567840576, + "avg_penalty/before_target": 0.47465042769908905, + "avg_penalty/before_think": 0.3718792200088501, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.25, + "completions/max_terminated_length": 600.25, + "completions/mean_length": 179.71875, + "completions/mean_terminated_length": 179.71875, + "completions/min_length": 32.25, + "completions/min_terminated_length": 32.25, + "epoch": 0.745, + "grad_norm": 6.166507244110107, + "kl": 18.515625, + "learning_rate": 3.7203694235066224e-06, + "loss": 1.749, + "num_tokens": 43874803.0, + "reward": 1.5546875, + "reward_std": 0.7801915407180786, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4079566150903702, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.3781409040093422, + "step": 1490, + "token_counts/after_target": 644.0, + "token_counts/after_think": 64.75, + "token_counts/before_target": 1146.5, + "token_counts/before_think": 1020.25 + }, + { + "avg_penalty/after_target": 2.622638165950775, + "avg_penalty/after_think": 2.621209740638733, + "avg_penalty/before_target": 0.5311332754790783, + "avg_penalty/before_think": 0.3403617963194847, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 609.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 166.8125, + 
"completions/mean_terminated_length": 154.41354370117188, + "completions/min_length": 36.75, + "completions/min_terminated_length": 36.75, + "epoch": 0.7455, + "grad_norm": 2.9950225353240967, + "kl": 26.6875, + "learning_rate": 3.7067960895016277e-06, + "loss": 2.3018, + "num_tokens": 43895079.0, + "reward": 1.55078125, + "reward_std": 0.7542260438203812, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.43616948276758194, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.34143708646297455, + "step": 1491, + "token_counts/after_target": 744.5, + "token_counts/after_think": 34.75, + "token_counts/before_target": 1290.25, + "token_counts/before_think": 599.5 + }, + { + "avg_penalty/after_target": 1.8900682926177979, + "avg_penalty/after_think": 3.864628314971924, + "avg_penalty/before_target": 0.4619595631957054, + "avg_penalty/before_think": 0.4235173538327217, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.5, + "completions/max_terminated_length": 562.5, + "completions/mean_length": 180.65625, + "completions/mean_terminated_length": 180.65625, + "completions/min_length": 31.5, + "completions/min_terminated_length": 31.5, + "epoch": 0.746, + "grad_norm": 4.0718231201171875, + "kl": 21.125, + "learning_rate": 3.693241925687141e-06, + "loss": 1.9511, + "num_tokens": 43917233.0, + "reward": 1.640625, + "reward_std": 0.7286457419395447, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4022643193602562, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.34512875974178314, + "step": 1492, + "token_counts/after_target": 577.75, + 
"token_counts/after_think": 57.0, + "token_counts/before_target": 1344.5, + "token_counts/before_think": 911.25 + }, + { + "avg_penalty/after_target": 3.054910898208618, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.45474841073155403, + "avg_penalty/before_think": 0.43875088915228844, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.75, + "completions/max_terminated_length": 687.75, + "completions/mean_length": 230.140625, + "completions/mean_terminated_length": 230.140625, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.7465, + "grad_norm": 3.734433889389038, + "kl": 22.5625, + "learning_rate": 3.679706973351491e-06, + "loss": 1.9659, + "num_tokens": 43941434.0, + "reward": 1.45703125, + "reward_std": 0.8180265426635742, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.42739029973745346, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.4133409336209297, + "step": 1493, + "token_counts/after_target": 753.0, + "token_counts/after_think": 94.75, + "token_counts/before_target": 1838.75, + "token_counts/before_think": 995.75 + }, + { + "avg_penalty/after_target": 2.1667374074459076, + "avg_penalty/after_think": 3.0558109283447266, + "avg_penalty/before_target": 0.28270431235432625, + "avg_penalty/before_think": 0.4465213418006897, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.5, + "completions/max_terminated_length": 535.5, + "completions/mean_length": 165.640625, + "completions/mean_terminated_length": 165.640625, + 
"completions/min_length": 49.75, + "completions/min_terminated_length": 49.75, + "epoch": 0.747, + "grad_norm": 2.7978787422180176, + "kl": 18.4375, + "learning_rate": 3.6661912737244996e-06, + "loss": 1.6604, + "num_tokens": 43961475.0, + "reward": 1.7421875, + "reward_std": 0.6313086301088333, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3450859263539314, + "rewards/tag_count_reward/mean": 0.8828125, + "rewards/tag_count_reward/std": 0.28959690779447556, + "step": 1494, + "token_counts/after_target": 259.5, + "token_counts/after_think": 28.75, + "token_counts/before_target": 1679.25, + "token_counts/before_think": 682.75 + }, + { + "avg_penalty/after_target": 1.68214812874794, + "avg_penalty/after_think": 3.8555734753608704, + "avg_penalty/before_target": 0.4951085075736046, + "avg_penalty/before_think": 0.525594562292099, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 742.5, + "completions/max_terminated_length": 594.25, + "completions/mean_length": 186.40625, + "completions/mean_terminated_length": 172.5250015258789, + "completions/min_length": 47.75, + "completions/min_terminated_length": 47.75, + "epoch": 0.7475, + "grad_norm": 6.468764305114746, + "kl": 25.34375, + "learning_rate": 3.6526948679773256e-06, + "loss": 2.0769, + "num_tokens": 43983661.0, + "reward": 1.55078125, + "reward_std": 0.833182081580162, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.43303824216127396, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.4064244106411934, + "step": 1495, + "token_counts/after_target": 590.25, + "token_counts/after_think": 40.75, + 
"token_counts/before_target": 1597.25, + "token_counts/before_think": 754.25 + }, + { + "avg_penalty/after_target": 2.4297573268413544, + "avg_penalty/after_think": 2.4864725470542908, + "avg_penalty/before_target": 0.31706585362553596, + "avg_penalty/before_think": 0.4126063883304596, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.5, + "completions/max_terminated_length": 454.5, + "completions/mean_length": 172.609375, + "completions/mean_terminated_length": 172.609375, + "completions/min_length": 41.25, + "completions/min_terminated_length": 41.25, + "epoch": 0.748, + "grad_norm": 5.392134189605713, + "kl": 17.8125, + "learning_rate": 3.6392177972223596e-06, + "loss": 1.4246, + "num_tokens": 44005172.0, + "reward": 1.625, + "reward_std": 0.7682497203350067, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4141380712389946, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.36396414786577225, + "step": 1496, + "token_counts/after_target": 436.5, + "token_counts/after_think": 44.5, + "token_counts/before_target": 1189.75, + "token_counts/before_think": 1091.0 + }, + { + "avg_penalty/after_target": 3.088262975215912, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.2836729660630226, + "avg_penalty/before_think": 0.4343739300966263, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.5, + "completions/max_terminated_length": 463.5, + "completions/mean_length": 179.859375, + "completions/mean_terminated_length": 179.859375, + "completions/min_length": 58.75, + 
"completions/min_terminated_length": 58.75, + "epoch": 0.7485, + "grad_norm": 8.608089447021484, + "kl": 14.8671875, + "learning_rate": 3.625760102513103e-06, + "loss": 1.6785, + "num_tokens": 44024763.0, + "reward": 1.73828125, + "reward_std": 0.6828649044036865, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.125, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3683478757739067, + "rewards/tag_count_reward/mean": 0.87890625, + "rewards/tag_count_reward/std": 0.30074620991945267, + "step": 1497, + "token_counts/after_target": 607.5, + "token_counts/after_think": 78.5, + "token_counts/before_target": 1234.0, + "token_counts/before_think": 957.75 + }, + { + "avg_penalty/after_target": 2.7116947174072266, + "avg_penalty/after_think": 3.688149571418762, + "avg_penalty/before_target": 0.23886560276150703, + "avg_penalty/before_think": 0.41318731009960175, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 154.734375, + "completions/mean_terminated_length": 154.734375, + "completions/min_length": 36.25, + "completions/min_terminated_length": 36.25, + "epoch": 0.749, + "grad_norm": 3.4770822525024414, + "kl": 13.833984375, + "learning_rate": 3.612321824844024e-06, + "loss": 1.3167, + "num_tokens": 44043706.0, + "reward": 1.77734375, + "reward_std": 0.5669179856777191, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.28694770485162735, + "rewards/tag_count_reward/mean": 0.88671875, + "rewards/tag_count_reward/std": 0.2651234194636345, + "step": 1498, + "token_counts/after_target": 358.25, + "token_counts/after_think": 25.5, + "token_counts/before_target": 1015.5, + 
"token_counts/before_think": 1076.5 + }, + { + "avg_penalty/after_target": 2.300789475440979, + "avg_penalty/after_think": 2.3029625415802, + "avg_penalty/before_target": 0.3296910710632801, + "avg_penalty/before_think": 0.6279953569173813, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.25, + "completions/max_terminated_length": 646.25, + "completions/mean_length": 183.96875, + "completions/mean_terminated_length": 183.96875, + "completions/min_length": 54.5, + "completions/min_terminated_length": 54.5, + "epoch": 0.7495, + "grad_norm": 9.754026412963867, + "kl": 23.71875, + "learning_rate": 3.598903005150444e-06, + "loss": 1.8913, + "num_tokens": 44064344.0, + "reward": 1.625, + "reward_std": 0.7204088419675827, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.3890564441680908, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.34389182180166245, + "step": 1499, + "token_counts/after_target": 457.25, + "token_counts/after_think": 16.25, + "token_counts/before_target": 1667.0, + "token_counts/before_think": 803.0 + }, + { + "avg_penalty/after_target": 2.3929420113563538, + "avg_penalty/after_think": 2.9279523491859436, + "avg_penalty/before_target": 0.3810619153082371, + "avg_penalty/before_think": 0.4309021830558777, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.25, + "completions/max_terminated_length": 695.25, + "completions/mean_length": 203.875, + "completions/mean_terminated_length": 203.875, + "completions/min_length": 43.5, + "completions/min_terminated_length": 43.5, + "epoch": 0.75, 
+ "grad_norm": 10.212488174438477, + "kl": 25.25, + "learning_rate": 3.5855036843084213e-06, + "loss": 1.8778, + "num_tokens": 44088288.0, + "reward": 1.53515625, + "reward_std": 0.7929867506027222, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4260597825050354, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.3824653699994087, + "step": 1500, + "token_counts/after_target": 535.5, + "token_counts/after_think": 18.0, + "token_counts/before_target": 1713.5, + "token_counts/before_think": 995.0 + }, + { + "avg_penalty/after_target": 2.690346986055374, + "avg_penalty/after_think": 2.7985397577285767, + "avg_penalty/before_target": 0.33413802087306976, + "avg_penalty/before_think": 0.3784530274569988, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.5, + "completions/max_terminated_length": 601.5, + "completions/mean_length": 205.71875, + "completions/mean_terminated_length": 205.71875, + "completions/min_length": 36.5, + "completions/min_terminated_length": 36.5, + "epoch": 0.7505, + "grad_norm": 9.325592041015625, + "kl": 28.71875, + "learning_rate": 3.5721239031346067e-06, + "loss": 2.1919, + "num_tokens": 44111198.0, + "reward": 1.4921875, + "reward_std": 0.8595259934663773, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44187305867671967, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.4260246232151985, + "step": 1501, + "token_counts/after_target": 571.5, + "token_counts/after_think": 50.5, + "token_counts/before_target": 2024.0, + "token_counts/before_think": 645.5 + }, + { + "avg_penalty/after_target": 2.097891330718994, + 
"avg_penalty/after_think": 2.625620722770691, + "avg_penalty/before_target": 0.5506287962198257, + "avg_penalty/before_think": 0.36434245109558105, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.5, + "completions/max_terminated_length": 646.5, + "completions/mean_length": 209.5625, + "completions/mean_terminated_length": 209.5625, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.751, + "grad_norm": 3.721883773803711, + "kl": 30.46875, + "learning_rate": 3.5587637023861356e-06, + "loss": 2.6177, + "num_tokens": 44135874.0, + "reward": 1.640625, + "reward_std": 0.742177277803421, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.3283620551228523, + "step": 1502, + "token_counts/after_target": 722.5, + "token_counts/after_think": 129.75, + "token_counts/before_target": 1713.25, + "token_counts/before_think": 787.5 + }, + { + "avg_penalty/after_target": 2.5382310152053833, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3023683540523052, + "avg_penalty/before_think": 0.26937834173440933, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.75, + "completions/max_terminated_length": 474.75, + "completions/mean_length": 157.890625, + "completions/mean_terminated_length": 157.890625, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.7515, + "grad_norm": 5.456370830535889, + "kl": 21.90625, + "learning_rate": 3.545423122760493e-06, 
+ "loss": 1.7134, + "num_tokens": 44157515.0, + "reward": 1.66015625, + "reward_std": 0.8084198385477066, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.42516325414180756, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.35111451894044876, + "step": 1503, + "token_counts/after_target": 253.0, + "token_counts/after_think": 88.5, + "token_counts/before_target": 1466.75, + "token_counts/before_think": 718.0 + }, + { + "avg_penalty/after_target": 2.3844062983989716, + "avg_penalty/after_think": 3.626218557357788, + "avg_penalty/before_target": 0.3440960571169853, + "avg_penalty/before_think": 0.41580329090356827, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 121.640625, + "completions/mean_terminated_length": 121.640625, + "completions/min_length": 48.75, + "completions/min_terminated_length": 48.75, + "epoch": 0.752, + "grad_norm": 3.83609676361084, + "kl": 6.8828125, + "learning_rate": 3.5321022048954036e-06, + "loss": 0.8386, + "num_tokens": 44174820.0, + "reward": 1.84375, + "reward_std": 0.46656501293182373, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.23328252136707306, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.23328252136707306, + "step": 1504, + "token_counts/after_target": 204.0, + "token_counts/after_think": 31.5, + "token_counts/before_target": 936.0, + "token_counts/before_think": 774.75 + }, + { + "avg_penalty/after_target": 1.7066277861595154, + "avg_penalty/after_think": 2.775070011615753, + 
"avg_penalty/before_target": 0.39913105219602585, + "avg_penalty/before_think": 0.5112730637192726, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.25, + "completions/max_terminated_length": 515.25, + "completions/mean_length": 185.59375, + "completions/mean_terminated_length": 185.59375, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.7525, + "grad_norm": 6.255773067474365, + "kl": 21.78125, + "learning_rate": 3.5188009893686916e-06, + "loss": 1.7058, + "num_tokens": 44197242.0, + "reward": 1.6171875, + "reward_std": 0.7405305057764053, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.41194770485162735, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.3419371247291565, + "step": 1505, + "token_counts/after_target": 391.75, + "token_counts/after_think": 16.25, + "token_counts/before_target": 1665.5, + "token_counts/before_think": 896.0 + }, + { + "avg_penalty/after_target": 2.0120323300361633, + "avg_penalty/after_think": 3.595200538635254, + "avg_penalty/before_target": 0.37489231675863266, + "avg_penalty/before_think": 0.5237308070063591, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.75, + "completions/max_terminated_length": 524.75, + "completions/mean_length": 152.0, + "completions/mean_terminated_length": 152.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.753, + "grad_norm": 2.9945220947265625, + "kl": 17.2734375, + "learning_rate": 3.505519516698165e-06, + "loss": 1.4802, + "num_tokens": 
44214906.0, + "reward": 1.65234375, + "reward_std": 0.7190857231616974, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3811737820506096, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.3486415222287178, + "step": 1506, + "token_counts/after_target": 501.25, + "token_counts/after_think": 74.25, + "token_counts/before_target": 1172.75, + "token_counts/before_think": 683.75 + }, + { + "avg_penalty/after_target": 2.509931206703186, + "avg_penalty/after_think": 1.60618257522583, + "avg_penalty/before_target": 0.26298850029706955, + "avg_penalty/before_think": 0.32944686710834503, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.75, + "completions/max_terminated_length": 462.75, + "completions/mean_length": 154.96875, + "completions/mean_terminated_length": 154.96875, + "completions/min_length": 40.5, + "completions/min_terminated_length": 40.5, + "epoch": 0.7535, + "grad_norm": 3.407308578491211, + "kl": 18.75, + "learning_rate": 3.492257827341492e-06, + "loss": 1.5325, + "num_tokens": 44235400.0, + "reward": 1.49609375, + "reward_std": 0.8643490523099899, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.45508860796689987, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.41308456659317017, + "step": 1507, + "token_counts/after_target": 276.25, + "token_counts/after_think": 32.5, + "token_counts/before_target": 1272.75, + "token_counts/before_think": 898.0 + }, + { + "avg_penalty/after_target": 2.744671732187271, + "avg_penalty/after_think": 0.8188217878341675, + "avg_penalty/before_target": 0.6769682914018631, + 
"avg_penalty/before_think": 0.36607636511325836, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 609.0, + "completions/max_terminated_length": 466.25, + "completions/mean_length": 161.171875, + "completions/mean_terminated_length": 146.28020858764648, + "completions/min_length": 34.25, + "completions/min_terminated_length": 34.25, + "epoch": 0.754, + "grad_norm": 8.323038101196289, + "kl": 32.78125, + "learning_rate": 3.479015961696077e-06, + "loss": 2.9494, + "num_tokens": 44254803.0, + "reward": 1.5, + "reward_std": 0.8304832279682159, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.38965848088264465, + "step": 1508, + "token_counts/after_target": 775.0, + "token_counts/after_think": 1.75, + "token_counts/before_target": 1219.5, + "token_counts/before_think": 582.5 + }, + { + "avg_penalty/after_target": 2.0981967449188232, + "avg_penalty/after_think": 3.668205142021179, + "avg_penalty/before_target": 0.22463861107826233, + "avg_penalty/before_think": 0.496454693377018, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.75, + "completions/max_terminated_length": 517.75, + "completions/mean_length": 192.765625, + "completions/mean_terminated_length": 192.765625, + "completions/min_length": 56.25, + "completions/min_terminated_length": 56.25, + "epoch": 0.7545, + "grad_norm": 6.066761016845703, + "kl": 19.25, + "learning_rate": 3.4657939600989453e-06, + "loss": 1.5079, + "num_tokens": 44276292.0, + "reward": 1.61328125, + 
"reward_std": 0.7433078438043594, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.34242168813943863, + "step": 1509, + "token_counts/after_target": 244.5, + "token_counts/after_think": 57.75, + "token_counts/before_target": 1620.5, + "token_counts/before_think": 1161.5 + }, + { + "avg_penalty/after_target": 2.2777462899684906, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.24609708786010742, + "avg_penalty/before_think": 0.35598525404930115, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.5, + "completions/max_terminated_length": 452.5, + "completions/mean_length": 158.1875, + "completions/mean_terminated_length": 158.1875, + "completions/min_length": 34.75, + "completions/min_terminated_length": 34.75, + "epoch": 0.755, + "grad_norm": 4.804813385009766, + "kl": 22.75, + "learning_rate": 3.452591862826603e-06, + "loss": 1.7832, + "num_tokens": 44296736.0, + "reward": 1.609375, + "reward_std": 0.8217267841100693, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.38390107452869415, + "step": 1510, + "token_counts/after_target": 210.75, + "token_counts/after_think": 148.75, + "token_counts/before_target": 1330.25, + "token_counts/before_think": 841.25 + }, + { + "avg_penalty/after_target": 2.028011441230774, + "avg_penalty/after_think": 3.7011146545410156, + "avg_penalty/before_target": 0.6369874551892281, + "avg_penalty/before_think": 0.45528895407915115, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.75, + "completions/max_terminated_length": 718.75, + "completions/mean_length": 178.390625, + "completions/mean_terminated_length": 178.390625, + "completions/min_length": 37.75, + "completions/min_terminated_length": 37.75, + "epoch": 0.7555, + "grad_norm": 7.011771202087402, + "kl": 30.1875, + "learning_rate": 3.4394097100949286e-06, + "loss": 2.7284, + "num_tokens": 44317577.0, + "reward": 1.56640625, + "reward_std": 0.780193641781807, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.41503459960222244, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.3735913336277008, + "step": 1511, + "token_counts/after_target": 814.75, + "token_counts/after_think": 51.75, + "token_counts/before_target": 1412.5, + "token_counts/before_think": 575.25 + }, + { + "avg_penalty/after_target": 2.5106109380722046, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4327758327126503, + "avg_penalty/before_think": 0.6927971392869949, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 208.046875, + "completions/mean_terminated_length": 208.046875, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.756, + "grad_norm": 3.0813164710998535, + "kl": 21.21875, + "learning_rate": 3.4262475420590414e-06, + "loss": 1.9032, + "num_tokens": 44339644.0, + "reward": 1.56640625, + "reward_std": 0.804341658949852, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42867646366357803, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.38951559364795685, + "step": 1512, + "token_counts/after_target": 664.25, + "token_counts/after_think": 99.25, + "token_counts/before_target": 1641.5, + "token_counts/before_think": 923.75 + }, + { + "avg_penalty/after_target": 1.960842490196228, + "avg_penalty/after_think": 3.443837583065033, + "avg_penalty/before_target": 0.4741654582321644, + "avg_penalty/before_think": 0.4337494820356369, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 610.25, + "completions/max_terminated_length": 610.25, + "completions/mean_length": 190.546875, + "completions/mean_terminated_length": 190.546875, + "completions/min_length": 37.75, + "completions/min_terminated_length": 37.75, + "epoch": 0.7565, + "grad_norm": 2.7840170860290527, + "kl": 22.953125, + "learning_rate": 3.4131053988131947e-06, + "loss": 1.9758, + "num_tokens": 44360319.0, + "reward": 1.65234375, + "reward_std": 0.6949131786823273, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.30215744860470295, + "step": 1513, + "token_counts/after_target": 565.25, + "token_counts/after_think": 33.25, + "token_counts/before_target": 1736.0, + "token_counts/before_think": 714.25 + }, + { + "avg_penalty/after_target": 3.0526891350746155, + "avg_penalty/after_think": 2.8805035948753357, + "avg_penalty/before_target": 0.2467946745455265, + "avg_penalty/before_think": 0.3730822987854481, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.5, + "completions/max_terminated_length": 576.5, + "completions/mean_length": 201.875, + "completions/mean_terminated_length": 201.875, + "completions/min_length": 49.25, + "completions/min_terminated_length": 49.25, + "epoch": 0.757, + "grad_norm": 3.6817123889923096, + "kl": 17.46875, + "learning_rate": 3.399983320390633e-06, + "loss": 1.5145, + "num_tokens": 44387047.0, + "reward": 1.6328125, + "reward_std": 0.5909731388092041, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.3300696536898613, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.27664652466773987, + "step": 1514, + "token_counts/after_target": 433.25, + "token_counts/after_think": 52.75, + "token_counts/before_target": 1880.5, + "token_counts/before_think": 863.5 + }, + { + "avg_penalty/after_target": 2.5364700853824615, + "avg_penalty/after_think": 2.3951989114284515, + "avg_penalty/before_target": 0.4872037321329117, + "avg_penalty/before_think": 0.3946199119091034, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 783.25, + "completions/max_terminated_length": 591.25, + "completions/mean_length": 231.90625, + "completions/mean_terminated_length": 218.01250076293945, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.7575, + "grad_norm": 2.7505483627319336, + "kl": 28.375, + "learning_rate": 3.3868813467634833e-06, + "loss": 2.4666, + "num_tokens": 44415713.0, + "reward": 1.53125, + "reward_std": 0.8176640272140503, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4308478757739067, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.39756059646606445, + "step": 1515, + "token_counts/after_target": 1020.5, + "token_counts/after_think": 71.75, + "token_counts/before_target": 1765.0, + "token_counts/before_think": 853.25 + }, + { + "avg_penalty/after_target": 3.1977943778038025, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.43002306669950485, + "avg_penalty/before_think": 0.4142632633447647, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.5, + "completions/max_terminated_length": 597.5, + "completions/mean_length": 203.84375, + "completions/mean_terminated_length": 203.84375, + "completions/min_length": 49.5, + "completions/min_terminated_length": 49.5, + "epoch": 0.758, + "grad_norm": 7.640476703643799, + "kl": 19.65625, + "learning_rate": 3.3737995178426276e-06, + "loss": 1.9357, + "num_tokens": 44439127.0, + "reward": 1.70703125, + "reward_std": 0.743480771780014, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 0.86328125, + "rewards/tag_count_reward/std": 0.3347749337553978, + "step": 1516, + "token_counts/after_target": 672.25, + "token_counts/after_think": 4.5, + "token_counts/before_target": 1803.25, + "token_counts/before_think": 781.5 + }, + { + "avg_penalty/after_target": 2.4859440326690674, + "avg_penalty/after_think": 2.8270366191864014, + "avg_penalty/before_target": 0.34864190220832825, + "avg_penalty/before_think": 0.452807180583477, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 219.609375, + "completions/mean_terminated_length": 219.609375, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.7585, + "grad_norm": 2.614680290222168, + "kl": 18.078125, + "learning_rate": 3.360737873477584e-06, + "loss": 1.5284, + "num_tokens": 44466830.0, + "reward": 1.5859375, + "reward_std": 0.8061973303556442, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.39712292701005936, + "step": 1517, + "token_counts/after_target": 579.75, + "token_counts/after_think": 80.75, + "token_counts/before_target": 1907.25, + "token_counts/before_think": 946.0 + }, + { + "avg_penalty/after_target": 2.612481951713562, + "avg_penalty/after_think": 3.5946184396743774, + "avg_penalty/before_target": 0.40728629380464554, + "avg_penalty/before_think": 0.5152209773659706, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.75, + "completions/max_terminated_length": 666.75, + "completions/mean_length": 189.015625, + "completions/mean_terminated_length": 189.015625, + "completions/min_length": 38.75, + "completions/min_terminated_length": 38.75, + "epoch": 0.759, + "grad_norm": 4.114999294281006, + "kl": 24.890625, + "learning_rate": 3.3476964534563927e-06, + "loss": 2.2357, + "num_tokens": 44489455.0, + "reward": 1.6875, + "reward_std": 0.8370992094278336, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.11967839300632477, + "rewards/format_reward/mean": 0.78125, + 
"rewards/format_reward/std": 0.42516325414180756, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.3561921827495098, + "step": 1518, + "token_counts/after_target": 692.0, + "token_counts/after_think": 66.75, + "token_counts/before_target": 1782.0, + "token_counts/before_think": 483.5 + }, + { + "avg_penalty/after_target": 2.10923770070076, + "avg_penalty/after_think": 3.7856873273849487, + "avg_penalty/before_target": 0.4464401677250862, + "avg_penalty/before_think": 0.5552891865372658, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.75, + "completions/max_terminated_length": 591.75, + "completions/mean_length": 211.046875, + "completions/mean_terminated_length": 211.046875, + "completions/min_length": 23.5, + "completions/min_terminated_length": 23.5, + "epoch": 0.7595, + "grad_norm": 3.61323881149292, + "kl": 22.453125, + "learning_rate": 3.3346752975054763e-06, + "loss": 1.9835, + "num_tokens": 44514226.0, + "reward": 1.5625, + "reward_std": 0.8405773937702179, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4519384130835533, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.37420687079429626, + "step": 1519, + "token_counts/after_target": 624.5, + "token_counts/after_think": 55.0, + "token_counts/before_target": 1986.5, + "token_counts/before_think": 710.75 + }, + { + "avg_penalty/after_target": 2.3977255821228027, + "avg_penalty/after_think": 1.6757308840751648, + "avg_penalty/before_target": 0.5004541799426079, + "avg_penalty/before_think": 0.5387773215770721, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.5, + "completions/max_terminated_length": 804.5, + "completions/mean_length": 260.03125, + "completions/mean_terminated_length": 260.03125, + "completions/min_length": 46.5, + "completions/min_terminated_length": 46.5, + "epoch": 0.76, + "grad_norm": 4.038911819458008, + "kl": 29.0, + "learning_rate": 3.3216744452895356e-06, + "loss": 2.4091, + "num_tokens": 44544452.0, + "reward": 1.43359375, + "reward_std": 0.8315329402685165, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.43526528775691986, + "rewards/tag_count_reward/mean": 0.73046875, + "rewards/tag_count_reward/std": 0.4121905565261841, + "step": 1520, + "token_counts/after_target": 1167.0, + "token_counts/after_think": 72.5, + "token_counts/before_target": 2177.5, + "token_counts/before_think": 743.5 + }, + { + "avg_penalty/after_target": 2.73081374168396, + "avg_penalty/after_think": 3.41992324590683, + "avg_penalty/before_target": 0.4880693331360817, + "avg_penalty/before_think": 0.5169877707958221, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 623.25, + "completions/max_terminated_length": 623.25, + "completions/mean_length": 185.03125, + "completions/mean_terminated_length": 185.03125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.7605, + "grad_norm": 14.933706283569336, + "kl": 13.484375, + "learning_rate": 3.308693936411421e-06, + "loss": 1.6037, + "num_tokens": 44568470.0, + "reward": 1.71484375, + "reward_std": 0.6751224994659424, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36797718703746796, + 
"rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.3159344121813774, + "step": 1521, + "token_counts/after_target": 558.75, + "token_counts/after_think": 220.75, + "token_counts/before_target": 1141.75, + "token_counts/before_think": 1039.25 + }, + { + "avg_penalty/after_target": 1.8248826563358307, + "avg_penalty/after_think": 2.7278937697410583, + "avg_penalty/before_target": 0.37617791444063187, + "avg_penalty/before_think": 0.48795250803232193, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.75, + "completions/max_terminated_length": 479.75, + "completions/mean_length": 163.625, + "completions/mean_terminated_length": 163.625, + "completions/min_length": 38.5, + "completions/min_terminated_length": 38.5, + "epoch": 0.761, + "grad_norm": 3.296776294708252, + "kl": 18.96875, + "learning_rate": 3.2957338104120096e-06, + "loss": 1.6438, + "num_tokens": 44592910.0, + "reward": 1.6640625, + "reward_std": 0.757234737277031, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38772592693567276, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.3704550787806511, + "step": 1522, + "token_counts/after_target": 347.5, + "token_counts/after_think": 66.25, + "token_counts/before_target": 1349.5, + "token_counts/before_think": 854.75 + }, + { + "avg_penalty/after_target": 3.046928882598877, + "avg_penalty/after_think": 2.860698163509369, + "avg_penalty/before_target": 0.27873430773615837, + "avg_penalty/before_think": 0.5067713931202888, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + 
"completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 180.421875, + "completions/mean_terminated_length": 180.421875, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.7615, + "grad_norm": 3.2015769481658936, + "kl": 20.3125, + "learning_rate": 3.2827941067700996e-06, + "loss": 1.726, + "num_tokens": 44612169.0, + "reward": 1.5625, + "reward_std": 0.788553386926651, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4154609143733978, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.37686236947774887, + "step": 1523, + "token_counts/after_target": 465.25, + "token_counts/after_think": 85.5, + "token_counts/before_target": 1561.25, + "token_counts/before_think": 774.75 + }, + { + "avg_penalty/after_target": 1.9639931321144104, + "avg_penalty/after_think": 3.9790979623794556, + "avg_penalty/before_target": 0.29779988154768944, + "avg_penalty/before_think": 0.5679555758833885, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.25, + "completions/max_terminated_length": 506.25, + "completions/mean_length": 141.0, + "completions/mean_terminated_length": 141.0, + "completions/min_length": 38.75, + "completions/min_terminated_length": 38.75, + "epoch": 0.762, + "grad_norm": 5.560370922088623, + "kl": 24.328125, + "learning_rate": 3.2698748649022693e-06, + "loss": 1.954, + "num_tokens": 44630121.0, + "reward": 1.58203125, + "reward_std": 0.791913166642189, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.41110680997371674, + "rewards/tag_count_reward/mean": 0.80078125, + 
"rewards/tag_count_reward/std": 0.371904693543911, + "step": 1524, + "token_counts/after_target": 284.0, + "token_counts/after_think": 70.75, + "token_counts/before_target": 1233.0, + "token_counts/before_think": 668.25 + }, + { + "avg_penalty/after_target": 2.276720941066742, + "avg_penalty/after_think": 2.964542269706726, + "avg_penalty/before_target": 0.5006067380309105, + "avg_penalty/before_think": 0.3956594616174698, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 587.5, + "completions/max_terminated_length": 554.75, + "completions/mean_length": 220.015625, + "completions/mean_terminated_length": 209.12187957763672, + "completions/min_length": 22.75, + "completions/min_terminated_length": 22.75, + "epoch": 0.7625, + "grad_norm": 2.8621411323547363, + "kl": 24.75, + "learning_rate": 3.2569761241627694e-06, + "loss": 2.1506, + "num_tokens": 44655146.0, + "reward": 1.484375, + "reward_std": 0.8399195373058319, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.3974381610751152, + "step": 1525, + "token_counts/after_target": 1004.0, + "token_counts/after_think": 41.0, + "token_counts/before_target": 1828.75, + "token_counts/before_think": 646.5 + }, + { + "avg_penalty/after_target": 2.8137325942516327, + "avg_penalty/after_think": 3.871985614299774, + "avg_penalty/before_target": 0.271632332354784, + "avg_penalty/before_think": 0.42631400376558304, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + 
"completions/max_terminated_length": 656.0, + "completions/mean_length": 220.359375, + "completions/mean_terminated_length": 220.359375, + "completions/min_length": 52.75, + "completions/min_terminated_length": 52.75, + "epoch": 0.763, + "grad_norm": 5.481348991394043, + "kl": 19.15625, + "learning_rate": 3.2440979238433977e-06, + "loss": 1.513, + "num_tokens": 44680593.0, + "reward": 1.5546875, + "reward_std": 0.7945386320352554, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.43399807065725327, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.38053786754608154, + "step": 1526, + "token_counts/after_target": 505.0, + "token_counts/after_think": 51.5, + "token_counts/before_target": 2149.75, + "token_counts/before_think": 819.5 + }, + { + "avg_penalty/after_target": 1.6134191304445267, + "avg_penalty/after_think": 3.4814750850200653, + "avg_penalty/before_target": 0.3735000230371952, + "avg_penalty/before_think": 0.5039191320538521, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.75, + "completions/max_terminated_length": 560.75, + "completions/mean_length": 169.546875, + "completions/mean_terminated_length": 169.546875, + "completions/min_length": 43.5, + "completions/min_terminated_length": 43.5, + "epoch": 0.7635, + "grad_norm": 6.046893119812012, + "kl": 23.796875, + "learning_rate": 3.2312403031733943e-06, + "loss": 1.9306, + "num_tokens": 44700212.0, + "reward": 1.625, + "reward_std": 0.7171644568443298, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.3846946656703949, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 
0.34150128066539764, + "step": 1527, + "token_counts/after_target": 298.5, + "token_counts/after_think": 42.5, + "token_counts/before_target": 1619.0, + "token_counts/before_think": 752.75 + }, + { + "avg_penalty/after_target": 2.5831914246082306, + "avg_penalty/after_think": 2.8202773928642273, + "avg_penalty/before_target": 0.3357250802218914, + "avg_penalty/before_think": 0.4264926537871361, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.25, + "completions/max_terminated_length": 502.25, + "completions/mean_length": 194.09375, + "completions/mean_terminated_length": 194.09375, + "completions/min_length": 39.75, + "completions/min_terminated_length": 39.75, + "epoch": 0.764, + "grad_norm": 3.10086727142334, + "kl": 16.640625, + "learning_rate": 3.2184033013192962e-06, + "loss": 1.5289, + "num_tokens": 44720890.0, + "reward": 1.6640625, + "reward_std": 0.7208899408578873, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3683478757739067, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.3534688949584961, + "step": 1528, + "token_counts/after_target": 594.0, + "token_counts/after_think": 12.5, + "token_counts/before_target": 1924.25, + "token_counts/before_think": 574.75 + }, + { + "avg_penalty/after_target": 2.793103814125061, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3892641104757786, + "avg_penalty/before_think": 0.5444246381521225, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 
197.15625, + "completions/mean_terminated_length": 197.15625, + "completions/min_length": 48.5, + "completions/min_terminated_length": 48.5, + "epoch": 0.7645, + "grad_norm": 3.0024712085723877, + "kl": 18.20703125, + "learning_rate": 3.2055869573848374e-06, + "loss": 1.5905, + "num_tokens": 44742644.0, + "reward": 1.55859375, + "reward_std": 0.7831601053476334, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.125, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.43708496540784836, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.36653827130794525, + "step": 1529, + "token_counts/after_target": 574.0, + "token_counts/after_think": 21.0, + "token_counts/before_target": 1767.5, + "token_counts/before_think": 792.0 + }, + { + "avg_penalty/after_target": 2.444180876016617, + "avg_penalty/after_think": 3.4790150225162506, + "avg_penalty/before_target": 0.4045490622520447, + "avg_penalty/before_think": 0.5966192111372948, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 639.75, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 206.40625, + "completions/mean_terminated_length": 194.78646087646484, + "completions/min_length": 44.75, + "completions/min_terminated_length": 44.75, + "epoch": 0.765, + "grad_norm": 3.544707775115967, + "kl": 17.3671875, + "learning_rate": 3.192791310410822e-06, + "loss": 1.6149, + "num_tokens": 44766494.0, + "reward": 1.67578125, + "reward_std": 0.6827637553215027, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.37149807065725327, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.32106074690818787, + "step": 1530, + 
"token_counts/after_target": 685.0, + "token_counts/after_think": 26.25, + "token_counts/before_target": 1735.75, + "token_counts/before_think": 855.5 + }, + { + "avg_penalty/after_target": 3.098886013031006, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.391536608338356, + "avg_penalty/before_think": 0.47900596261024475, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.75, + "completions/max_terminated_length": 670.75, + "completions/mean_length": 247.0, + "completions/mean_terminated_length": 247.0, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.7655, + "grad_norm": 7.156198978424072, + "kl": 19.734375, + "learning_rate": 3.1800163993750166e-06, + "loss": 1.916, + "num_tokens": 44790014.0, + "reward": 1.56640625, + "reward_std": 0.7284420877695084, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4160471484065056, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.3270357772707939, + "step": 1531, + "token_counts/after_target": 1002.5, + "token_counts/after_think": 136.75, + "token_counts/before_target": 1460.75, + "token_counts/before_think": 1352.0 + }, + { + "avg_penalty/after_target": 2.3522475957870483, + "avg_penalty/after_think": 3.7916051745414734, + "avg_penalty/before_target": 0.3936493471264839, + "avg_penalty/before_think": 0.571266695857048, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 609.25, + "completions/max_terminated_length": 500.25, + "completions/mean_length": 202.703125, + 
"completions/mean_terminated_length": 190.23333740234375, + "completions/min_length": 39.75, + "completions/min_terminated_length": 39.75, + "epoch": 0.766, + "grad_norm": 4.380553722381592, + "kl": 14.21875, + "learning_rate": 3.1672622631920102e-06, + "loss": 1.3195, + "num_tokens": 44812107.0, + "reward": 1.55078125, + "reward_std": 0.8002783060073853, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.3715946599841118, + "step": 1532, + "token_counts/after_target": 685.75, + "token_counts/after_think": 142.0, + "token_counts/before_target": 1621.5, + "token_counts/before_think": 794.0 + }, + { + "avg_penalty/after_target": 2.047296106815338, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.37780410796403885, + "avg_penalty/before_think": 0.5118024796247482, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.25, + "completions/max_terminated_length": 614.25, + "completions/mean_length": 210.796875, + "completions/mean_terminated_length": 210.796875, + "completions/min_length": 44.25, + "completions/min_terminated_length": 44.25, + "epoch": 0.7665, + "grad_norm": 4.0590009689331055, + "kl": 11.953125, + "learning_rate": 3.1545289407131128e-06, + "loss": 1.1801, + "num_tokens": 44839262.0, + "reward": 1.79296875, + "reward_std": 0.5426802262663841, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.2979728877544403, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.22999096661806107, + "step": 1533, + "token_counts/after_target": 459.75, + 
"token_counts/after_think": 105.5, + "token_counts/before_target": 1642.25, + "token_counts/before_think": 1165.25 + }, + { + "avg_penalty/after_target": 2.6425327956676483, + "avg_penalty/after_think": 3.639721632003784, + "avg_penalty/before_target": 0.38279419392347336, + "avg_penalty/before_think": 0.67721176892519, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.75, + "completions/max_terminated_length": 662.75, + "completions/mean_length": 256.640625, + "completions/mean_terminated_length": 256.640625, + "completions/min_length": 34.25, + "completions/min_terminated_length": 34.25, + "epoch": 0.767, + "grad_norm": 2.899200201034546, + "kl": 16.1875, + "learning_rate": 3.1418164707262375e-06, + "loss": 1.5023, + "num_tokens": 44868439.0, + "reward": 1.6015625, + "reward_std": 0.6693937629461288, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.41104350984096527, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.3002399280667305, + "step": 1534, + "token_counts/after_target": 808.25, + "token_counts/after_think": 38.0, + "token_counts/before_target": 2237.25, + "token_counts/before_think": 1022.75 + }, + { + "avg_penalty/after_target": 2.5257005393505096, + "avg_penalty/after_think": 2.5184484124183655, + "avg_penalty/before_target": 0.6370499953627586, + "avg_penalty/before_think": 0.5901027172803879, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.5, + "completions/max_terminated_length": 757.5, + "completions/mean_length": 265.0, + "completions/mean_terminated_length": 265.0, + 
"completions/min_length": 40.75, + "completions/min_terminated_length": 40.75, + "epoch": 0.7675, + "grad_norm": 9.425206184387207, + "kl": 20.4375, + "learning_rate": 3.1291248919557717e-06, + "loss": 2.0341, + "num_tokens": 44894327.0, + "reward": 1.640625, + "reward_std": 0.6812386810779572, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4075859263539314, + "rewards/tag_count_reward/mean": 0.859375, + "rewards/tag_count_reward/std": 0.30425387993454933, + "step": 1535, + "token_counts/after_target": 1094.0, + "token_counts/after_think": 177.25, + "token_counts/before_target": 1746.5, + "token_counts/before_think": 1222.25 + }, + { + "avg_penalty/after_target": 2.6112672686576843, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.49052640050649643, + "avg_penalty/before_think": 0.6002802476286888, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 720.5, + "completions/max_terminated_length": 637.25, + "completions/mean_length": 247.140625, + "completions/mean_terminated_length": 235.1510467529297, + "completions/min_length": 49.25, + "completions/min_terminated_length": 49.25, + "epoch": 0.768, + "grad_norm": 3.8489573001861572, + "kl": 25.34375, + "learning_rate": 3.116454243062459e-06, + "loss": 2.1526, + "num_tokens": 44921808.0, + "reward": 1.48046875, + "reward_std": 0.8402712345123291, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.47360680997371674, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.3822649344801903, + "step": 1536, + "token_counts/after_target": 1056.75, + "token_counts/after_think": 31.75, + 
"token_counts/before_target": 2149.0, + "token_counts/before_think": 716.75 + }, + { + "avg_penalty/after_target": 2.1170877516269684, + "avg_penalty/after_think": 1.6463175415992737, + "avg_penalty/before_target": 0.28971071913838387, + "avg_penalty/before_think": 0.5784872621297836, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 621.5, + "completions/max_terminated_length": 621.5, + "completions/mean_length": 202.234375, + "completions/mean_terminated_length": 202.234375, + "completions/min_length": 48.25, + "completions/min_terminated_length": 48.25, + "epoch": 0.7685, + "grad_norm": 6.73053503036499, + "kl": 21.0, + "learning_rate": 3.103804562643302e-06, + "loss": 1.6099, + "num_tokens": 44946783.0, + "reward": 1.56640625, + "reward_std": 0.8047005832195282, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42867646366357803, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.3876604661345482, + "step": 1537, + "token_counts/after_target": 507.0, + "token_counts/after_think": 72.0, + "token_counts/before_target": 1773.5, + "token_counts/before_think": 883.25 + }, + { + "avg_penalty/after_target": 2.837780088186264, + "avg_penalty/after_think": 3.8741718530654907, + "avg_penalty/before_target": 0.30510594695806503, + "avg_penalty/before_think": 0.5042841359972954, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 661.0, + "completions/max_terminated_length": 583.5, + "completions/mean_length": 227.96875, + "completions/mean_terminated_length": 216.9947967529297, + "completions/min_length": 
35.75, + "completions/min_terminated_length": 35.75, + "epoch": 0.769, + "grad_norm": 10.242889404296875, + "kl": 28.0, + "learning_rate": 3.091175889231417e-06, + "loss": 2.0973, + "num_tokens": 44972653.0, + "reward": 1.40625, + "reward_std": 0.8638545870780945, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4704566150903702, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.43032533675432205, + "step": 1538, + "token_counts/after_target": 556.75, + "token_counts/after_think": 84.25, + "token_counts/before_target": 2153.5, + "token_counts/before_think": 853.0 + }, + { + "avg_penalty/after_target": 2.937287390232086, + "avg_penalty/after_think": 0.7701107263565063, + "avg_penalty/before_target": 0.4597502425312996, + "avg_penalty/before_think": 0.5027881562709808, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 851.25, + "completions/max_terminated_length": 780.75, + "completions/mean_length": 292.03125, + "completions/mean_terminated_length": 279.5906295776367, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.7695, + "grad_norm": 13.394960403442383, + "kl": 33.9375, + "learning_rate": 3.0785682612959334e-06, + "loss": 2.5242, + "num_tokens": 45002847.0, + "reward": 1.2578125, + "reward_std": 0.9685544073581696, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.5071863383054733, + "rewards/tag_count_reward/mean": 0.6484375, + "rewards/tag_count_reward/std": 0.44398343563079834, + "step": 1539, + "token_counts/after_target": 1196.0, + "token_counts/after_think": 34.0, + "token_counts/before_target": 
2196.0, + "token_counts/before_think": 1246.5 + }, + { + "avg_penalty/after_target": 2.8301824033260345, + "avg_penalty/after_think": 2.7009005546569824, + "avg_penalty/before_target": 0.3378002345561981, + "avg_penalty/before_think": 0.3969230428338051, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.75, + "completions/max_terminated_length": 557.75, + "completions/mean_length": 174.28125, + "completions/mean_terminated_length": 174.28125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.77, + "grad_norm": 4.91602087020874, + "kl": 16.1328125, + "learning_rate": 3.0659817172418694e-06, + "loss": 1.2722, + "num_tokens": 45024337.0, + "reward": 1.671875, + "reward_std": 0.6587941348552704, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.36136941611766815, + "rewards/tag_count_reward/mean": 0.859375, + "rewards/tag_count_reward/std": 0.3093990311026573, + "step": 1540, + "token_counts/after_target": 494.5, + "token_counts/after_think": 10.0, + "token_counts/before_target": 1525.0, + "token_counts/before_think": 759.0 + }, + { + "avg_penalty/after_target": 2.4472429156303406, + "avg_penalty/after_think": 1.3854255676269531, + "avg_penalty/before_target": 0.3660314306616783, + "avg_penalty/before_think": 0.382571067661047, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.75, + "completions/max_terminated_length": 513.75, + "completions/mean_length": 193.703125, + "completions/mean_terminated_length": 193.703125, + "completions/min_length": 42.5, + "completions/min_terminated_length": 
42.5, + "epoch": 0.7705, + "grad_norm": 7.75839900970459, + "kl": 24.671875, + "learning_rate": 3.0534162954100264e-06, + "loss": 1.8355, + "num_tokens": 45045166.0, + "reward": 1.41015625, + "reward_std": 0.9239933043718338, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.48148179799318314, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.4391625374555588, + "step": 1541, + "token_counts/after_target": 571.0, + "token_counts/after_think": 18.75, + "token_counts/before_target": 1755.5, + "token_counts/before_think": 754.0 + }, + { + "avg_penalty/after_target": 2.8409716188907623, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.37235482782125473, + "avg_penalty/before_think": 0.5722374096512794, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 704.25, + "completions/max_terminated_length": 704.25, + "completions/mean_length": 206.546875, + "completions/mean_terminated_length": 206.546875, + "completions/min_length": 42.25, + "completions/min_terminated_length": 42.25, + "epoch": 0.771, + "grad_norm": 4.769755840301514, + "kl": 22.6875, + "learning_rate": 3.040872034076857e-06, + "loss": 2.0616, + "num_tokens": 45067297.0, + "reward": 1.578125, + "reward_std": 0.7957262992858887, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.43303824216127396, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.377534382045269, + "step": 1542, + "token_counts/after_target": 770.5, + "token_counts/after_think": 25.25, + "token_counts/before_target": 1711.0, + "token_counts/before_think": 798.0 + }, + { + 
"avg_penalty/after_target": 1.9986751675605774, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.49005352705717087, + "avg_penalty/before_think": 0.4914683736860752, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.5, + "completions/max_terminated_length": 657.5, + "completions/mean_length": 274.9375, + "completions/mean_terminated_length": 274.9375, + "completions/min_length": 31.25, + "completions/min_terminated_length": 31.25, + "epoch": 0.7715, + "grad_norm": 19.514436721801758, + "kl": 29.15625, + "learning_rate": 3.028348971454356e-06, + "loss": 1.9357, + "num_tokens": 45095069.0, + "reward": 1.26953125, + "reward_std": 0.9265605509281158, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.49244368076324463, + "rewards/tag_count_reward/mean": 0.66015625, + "rewards/tag_count_reward/std": 0.4512580633163452, + "step": 1543, + "token_counts/after_target": 1020.0, + "token_counts/after_think": 91.5, + "token_counts/before_target": 1963.0, + "token_counts/before_think": 1324.5 + }, + { + "avg_penalty/after_target": 3.3527597784996033, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.38292642310261726, + "avg_penalty/before_think": 0.3443232625722885, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.75, + "completions/max_terminated_length": 572.75, + "completions/mean_length": 172.984375, + "completions/mean_terminated_length": 172.984375, + "completions/min_length": 39.25, + "completions/min_terminated_length": 39.25, + "epoch": 0.772, + "grad_norm": 3.8771469593048096, + "kl": 24.046875, + 
"learning_rate": 3.015847145689943e-06, + "loss": 2.119, + "num_tokens": 45119532.0, + "reward": 1.515625, + "reward_std": 0.8397034406661987, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.43655145168304443, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.40949371457099915, + "step": 1544, + "token_counts/after_target": 555.5, + "token_counts/after_think": 94.25, + "token_counts/before_target": 1372.5, + "token_counts/before_think": 745.5 + }, + { + "avg_penalty/after_target": 2.8914228081703186, + "avg_penalty/after_think": 1.825841248035431, + "avg_penalty/before_target": 0.33112072944641113, + "avg_penalty/before_think": 0.5379207879304886, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 667.75, + "completions/max_terminated_length": 667.75, + "completions/mean_length": 258.703125, + "completions/mean_terminated_length": 258.703125, + "completions/min_length": 52.25, + "completions/min_terminated_length": 52.25, + "epoch": 0.7725, + "grad_norm": 4.25370979309082, + "kl": 28.25, + "learning_rate": 3.003366594866345e-06, + "loss": 2.3656, + "num_tokens": 45146105.0, + "reward": 1.39453125, + "reward_std": 0.8501640558242798, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.48558124154806137, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.39502882212400436, + "step": 1545, + "token_counts/after_target": 1078.0, + "token_counts/after_think": 10.75, + "token_counts/before_target": 1990.75, + "token_counts/before_think": 1059.75 + }, + { + "avg_penalty/after_target": 2.5288268625736237, + "avg_penalty/after_think": 3.432240515947342, 
+ "avg_penalty/before_target": 0.3002711161971092, + "avg_penalty/before_think": 0.46522027254104614, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.75, + "completions/max_terminated_length": 446.75, + "completions/mean_length": 209.84375, + "completions/mean_terminated_length": 209.84375, + "completions/min_length": 54.5, + "completions/min_terminated_length": 54.5, + "epoch": 0.773, + "grad_norm": 3.5969839096069336, + "kl": 13.6640625, + "learning_rate": 2.990907357001491e-06, + "loss": 1.2847, + "num_tokens": 45168319.0, + "reward": 1.71875, + "reward_std": 0.5858210921287537, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.35648179799318314, + "rewards/tag_count_reward/mean": 0.890625, + "rewards/tag_count_reward/std": 0.2499946653842926, + "step": 1546, + "token_counts/after_target": 426.5, + "token_counts/after_think": 82.75, + "token_counts/before_target": 1461.25, + "token_counts/before_think": 1387.0 + }, + { + "avg_penalty/after_target": 2.0628511011600494, + "avg_penalty/after_think": 3.6337375044822693, + "avg_penalty/before_target": 0.42958181351423264, + "avg_penalty/before_think": 0.4362739995121956, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.75, + "completions/max_terminated_length": 675.75, + "completions/mean_length": 221.375, + "completions/mean_terminated_length": 221.375, + "completions/min_length": 47.75, + "completions/min_terminated_length": 47.75, + "epoch": 0.7735, + "grad_norm": 2.530318021774292, + "kl": 20.40625, + "learning_rate": 2.978469470048376e-06, + "loss": 1.7566, + "num_tokens": 
45190039.0, + "reward": 1.58984375, + "reward_std": 0.8566277027130127, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.43303824216127396, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.39681850373744965, + "step": 1547, + "token_counts/after_target": 642.75, + "token_counts/after_think": 82.5, + "token_counts/before_target": 1788.75, + "token_counts/before_think": 1028.0 + }, + { + "avg_penalty/after_target": 2.4544697403907776, + "avg_penalty/after_think": 0.6227008104324341, + "avg_penalty/before_target": 0.36612262576818466, + "avg_penalty/before_think": 0.456749826669693, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.75, + "completions/max_terminated_length": 592.75, + "completions/mean_length": 228.765625, + "completions/mean_terminated_length": 228.765625, + "completions/min_length": 34.75, + "completions/min_terminated_length": 34.75, + "epoch": 0.774, + "grad_norm": 12.721735954284668, + "kl": 28.28125, + "learning_rate": 2.9660529718949628e-06, + "loss": 1.9774, + "num_tokens": 45218712.0, + "reward": 1.203125, + "reward_std": 0.9589018523693085, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.49606408923864365, + "rewards/tag_count_reward/mean": 0.625, + "rewards/tag_count_reward/std": 0.47785231471061707, + "step": 1548, + "token_counts/after_target": 769.25, + "token_counts/after_think": 2.25, + "token_counts/before_target": 2033.5, + "token_counts/before_think": 855.25 + }, + { + "avg_penalty/after_target": 1.481249451637268, + "avg_penalty/after_think": 3.7285136580467224, + "avg_penalty/before_target": 
0.5201003327965736, + "avg_penalty/before_think": 0.5564221441745758, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.75, + "completions/max_terminated_length": 658.75, + "completions/mean_length": 259.28125, + "completions/mean_terminated_length": 259.28125, + "completions/min_length": 50.5, + "completions/min_terminated_length": 50.5, + "epoch": 0.7745, + "grad_norm": 3.2519989013671875, + "kl": 20.5546875, + "learning_rate": 2.953657900364053e-06, + "loss": 1.6941, + "num_tokens": 45245818.0, + "reward": 1.56640625, + "reward_std": 0.839728444814682, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.43399807065725327, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.40047411620616913, + "step": 1549, + "token_counts/after_target": 998.5, + "token_counts/after_think": 165.75, + "token_counts/before_target": 2297.75, + "token_counts/before_think": 686.5 + }, + { + "avg_penalty/after_target": 2.3977582454681396, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5422667786478996, + "avg_penalty/before_think": 0.5188959613442421, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 889.5, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 272.859375, + "completions/mean_terminated_length": 246.98438262939453, + "completions/min_length": 29.25, + "completions/min_terminated_length": 29.25, + "epoch": 0.775, + "grad_norm": 4.172231674194336, + "kl": 23.796875, + "learning_rate": 2.9412842932131904e-06, + "loss": 2.1331, + "num_tokens": 
45276513.0, + "reward": 1.4765625, + "reward_std": 0.796368882060051, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4260597825050354, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.3844856768846512, + "step": 1550, + "token_counts/after_target": 1287.25, + "token_counts/after_think": 20.75, + "token_counts/before_target": 1830.5, + "token_counts/before_think": 1227.25 + }, + { + "avg_penalty/after_target": 2.5485483407974243, + "avg_penalty/after_think": 2.7326218485832214, + "avg_penalty/before_target": 0.4464426636695862, + "avg_penalty/before_think": 0.3256990425288677, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 683.75, + "completions/max_terminated_length": 554.25, + "completions/mean_length": 200.828125, + "completions/mean_terminated_length": 186.8625030517578, + "completions/min_length": 41.25, + "completions/min_terminated_length": 41.25, + "epoch": 0.7755, + "grad_norm": 6.711849212646484, + "kl": 23.03125, + "learning_rate": 2.9289321881345257e-06, + "loss": 2.1704, + "num_tokens": 45300326.0, + "reward": 1.54296875, + "reward_std": 0.8319050520658493, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.43303824216127396, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.4158431813120842, + "step": 1551, + "token_counts/after_target": 755.0, + "token_counts/after_think": 15.25, + "token_counts/before_target": 1646.75, + "token_counts/before_think": 796.25 + }, + { + "avg_penalty/after_target": 2.4053321480751038, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3670395612716675, + 
"avg_penalty/before_think": 0.490377739071846, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 713.5, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 234.765625, + "completions/mean_terminated_length": 222.73333740234375, + "completions/min_length": 53.75, + "completions/min_terminated_length": 53.75, + "epoch": 0.776, + "grad_norm": 4.9059247970581055, + "kl": 16.8828125, + "learning_rate": 2.9166016227547135e-06, + "loss": 1.63, + "num_tokens": 45326151.0, + "reward": 1.57421875, + "reward_std": 0.6877022534608841, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.3507782220840454, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.35582026839256287, + "step": 1552, + "token_counts/after_target": 630.75, + "token_counts/after_think": 32.0, + "token_counts/before_target": 1727.25, + "token_counts/before_think": 1366.25 + }, + { + "avg_penalty/after_target": 1.5408422946929932, + "avg_penalty/after_think": 3.196947157382965, + "avg_penalty/before_target": 0.33499564602971077, + "avg_penalty/before_think": 0.5549566522240639, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 153.546875, + "completions/mean_terminated_length": 153.546875, + "completions/min_length": 26.5, + "completions/min_terminated_length": 26.5, + "epoch": 0.7765, + "grad_norm": 3.3056480884552, + "kl": 21.359375, + "learning_rate": 2.9042926346347932e-06, + "loss": 1.7989, + "num_tokens": 45347034.0, + "reward": 1.53125, + 
"reward_std": 0.7945999652147293, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.43708496540784836, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.37670497223734856, + "step": 1553, + "token_counts/after_target": 220.0, + "token_counts/after_think": 128.5, + "token_counts/before_target": 1320.75, + "token_counts/before_think": 787.5 + }, + { + "avg_penalty/after_target": 2.1863540709018707, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.387860544025898, + "avg_penalty/before_think": 0.46598346531391144, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.0, + "completions/max_terminated_length": 619.0, + "completions/mean_length": 233.890625, + "completions/mean_terminated_length": 233.890625, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.777, + "grad_norm": 6.12266731262207, + "kl": 22.53125, + "learning_rate": 2.8920052612700755e-06, + "loss": 2.0259, + "num_tokens": 45371939.0, + "reward": 1.453125, + "reward_std": 0.8130073547363281, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.45129410922527313, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.38118018954992294, + "step": 1554, + "token_counts/after_target": 803.0, + "token_counts/after_think": 22.5, + "token_counts/before_target": 1775.0, + "token_counts/before_think": 1141.75 + }, + { + "avg_penalty/after_target": 1.9728233516216278, + "avg_penalty/after_think": 2.141681522130966, + "avg_penalty/before_target": 0.39720369130373, + "avg_penalty/before_think": 0.4170203320682049, + "clip_ratio/high_max": 0.0, 
+ "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.25, + "completions/max_terminated_length": 550.25, + "completions/mean_length": 174.0625, + "completions/mean_terminated_length": 174.0625, + "completions/min_length": 42.75, + "completions/min_terminated_length": 42.75, + "epoch": 0.7775, + "grad_norm": 5.691036701202393, + "kl": 16.328125, + "learning_rate": 2.8797395400900362e-06, + "loss": 1.4858, + "num_tokens": 45393527.0, + "reward": 1.6796875, + "reward_std": 0.6829460114240646, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.29077933356165886, + "step": 1555, + "token_counts/after_target": 451.25, + "token_counts/after_think": 35.0, + "token_counts/before_target": 1111.25, + "token_counts/before_think": 1187.5 + }, + { + "avg_penalty/after_target": 2.4235771000385284, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.39479584991931915, + "avg_penalty/before_think": 0.38586500659585, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.5, + "completions/max_terminated_length": 605.5, + "completions/mean_length": 223.21875, + "completions/mean_terminated_length": 223.21875, + "completions/min_length": 48.5, + "completions/min_terminated_length": 48.5, + "epoch": 0.778, + "grad_norm": 5.207596778869629, + "kl": 21.421875, + "learning_rate": 2.867495508458186e-06, + "loss": 1.7278, + "num_tokens": 45419685.0, + "reward": 1.625, + "reward_std": 0.7847367152571678, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 
0.08539126068353653, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.37675637751817703, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.3707045465707779, + "step": 1556, + "token_counts/after_target": 548.25, + "token_counts/after_think": 20.75, + "token_counts/before_target": 1798.5, + "token_counts/before_think": 1204.0 + }, + { + "avg_penalty/after_target": 1.5485431253910065, + "avg_penalty/after_think": 3.9431872963905334, + "avg_penalty/before_target": 0.4395516961812973, + "avg_penalty/before_think": 0.5702429488301277, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.75, + "completions/max_terminated_length": 573.75, + "completions/mean_length": 211.625, + "completions/mean_terminated_length": 211.625, + "completions/min_length": 31.5, + "completions/min_terminated_length": 31.5, + "epoch": 0.7785, + "grad_norm": 2.6962902545928955, + "kl": 17.921875, + "learning_rate": 2.855273203671969e-06, + "loss": 1.6234, + "num_tokens": 45444253.0, + "reward": 1.57421875, + "reward_std": 0.7875581085681915, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42733466625213623, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.3786344900727272, + "step": 1557, + "token_counts/after_target": 439.0, + "token_counts/after_think": 144.25, + "token_counts/before_target": 1799.0, + "token_counts/before_think": 1003.75 + }, + { + "avg_penalty/after_target": 2.014526218175888, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.29408280923962593, + "avg_penalty/before_think": 0.5674442648887634, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.5, + "completions/max_terminated_length": 433.5, + "completions/mean_length": 164.984375, + "completions/mean_terminated_length": 164.984375, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.779, + "grad_norm": 3.537163496017456, + "kl": 10.765625, + "learning_rate": 2.8430726629626416e-06, + "loss": 1.0702, + "num_tokens": 45464380.0, + "reward": 1.734375, + "reward_std": 0.6985898464918137, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.3454566150903702, + "rewards/tag_count_reward/mean": 0.859375, + "rewards/tag_count_reward/std": 0.32405824959278107, + "step": 1558, + "token_counts/after_target": 246.0, + "token_counts/after_think": 102.0, + "token_counts/before_target": 1204.75, + "token_counts/before_think": 1087.0 + }, + { + "avg_penalty/after_target": 2.3786288797855377, + "avg_penalty/after_think": 1.6925458908081055, + "avg_penalty/before_target": 0.43972477316856384, + "avg_penalty/before_think": 0.5066439732909203, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.0, + "completions/max_terminated_length": 583.0, + "completions/mean_length": 173.875, + "completions/mean_terminated_length": 173.875, + "completions/min_length": 38.75, + "completions/min_terminated_length": 38.75, + "epoch": 0.7795, + "grad_norm": 7.707357883453369, + "kl": 19.3125, + "learning_rate": 2.830893923495173e-06, + "loss": 1.8604, + "num_tokens": 45483252.0, + "reward": 1.51171875, + "reward_std": 0.843046173453331, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 
0.734375, + "rewards/format_reward/std": 0.44938503205776215, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.40427933633327484, + "step": 1559, + "token_counts/after_target": 752.25, + "token_counts/after_think": 5.25, + "token_counts/before_target": 1424.75, + "token_counts/before_think": 599.75 + }, + { + "avg_penalty/after_target": 2.325327843427658, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3125220462679863, + "avg_penalty/before_think": 0.4546410143375397, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 621.25, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 223.5, + "completions/mean_terminated_length": 210.92292022705078, + "completions/min_length": 32.5, + "completions/min_terminated_length": 32.5, + "epoch": 0.78, + "grad_norm": 10.195667266845703, + "kl": 25.09375, + "learning_rate": 2.8187370223681134e-06, + "loss": 1.8273, + "num_tokens": 45507460.0, + "reward": 1.3359375, + "reward_std": 0.8927411139011383, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.4955305755138397, + "rewards/tag_count_reward/mean": 0.7109375, + "rewards/tag_count_reward/std": 0.4234953820705414, + "step": 1560, + "token_counts/after_target": 686.0, + "token_counts/after_think": 6.75, + "token_counts/before_target": 2259.25, + "token_counts/before_think": 624.0 + }, + { + "avg_penalty/after_target": 2.5090124905109406, + "avg_penalty/after_think": 2.8407821655273438, + "avg_penalty/before_target": 0.3066963367164135, + "avg_penalty/before_think": 0.4958350919187069, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 518.75, + "completions/max_terminated_length": 518.75, + "completions/mean_length": 209.53125, + "completions/mean_terminated_length": 209.53125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.7805, + "grad_norm": 8.759305953979492, + "kl": 13.859375, + "learning_rate": 2.8066019966134907e-06, + "loss": 1.5185, + "num_tokens": 45531478.0, + "reward": 1.66796875, + "reward_std": 0.6615759134292603, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.27991100400686264, + "step": 1561, + "token_counts/after_target": 613.0, + "token_counts/after_think": 76.75, + "token_counts/before_target": 1645.5, + "token_counts/before_think": 1017.25 + }, + { + "avg_penalty/after_target": 2.288949966430664, + "avg_penalty/after_think": 3.841874361038208, + "avg_penalty/before_target": 0.5449154302477837, + "avg_penalty/before_think": 0.7210688889026642, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 677.5, + "completions/max_terminated_length": 665.5, + "completions/mean_length": 253.328125, + "completions/mean_terminated_length": 242.0104217529297, + "completions/min_length": 35.25, + "completions/min_terminated_length": 35.25, + "epoch": 0.781, + "grad_norm": 7.194314002990723, + "kl": 17.515625, + "learning_rate": 2.794488883196699e-06, + "loss": 1.7688, + "num_tokens": 45557867.0, + "reward": 1.65625, + "reward_std": 0.6480631232261658, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4075859263539314, + 
"rewards/tag_count_reward/mean": 0.875, + "rewards/tag_count_reward/std": 0.276976615190506, + "step": 1562, + "token_counts/after_target": 803.5, + "token_counts/after_think": 191.0, + "token_counts/before_target": 1803.5, + "token_counts/before_think": 1255.25 + }, + { + "avg_penalty/after_target": 3.106360912322998, + "avg_penalty/after_think": 3.723109006881714, + "avg_penalty/before_target": 0.5359189063310623, + "avg_penalty/before_think": 0.3797261230647564, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 820.0, + "completions/max_terminated_length": 647.75, + "completions/mean_length": 265.5, + "completions/mean_terminated_length": 240.80417251586914, + "completions/min_length": 30.75, + "completions/min_terminated_length": 30.75, + "epoch": 0.7815, + "grad_norm": 3.0962867736816406, + "kl": 25.515625, + "learning_rate": 2.7823977190163788e-06, + "loss": 2.1971, + "num_tokens": 45587291.0, + "reward": 1.4140625, + "reward_std": 0.8230289667844772, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4493217319250107, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.40684717148542404, + "step": 1563, + "token_counts/after_target": 1075.5, + "token_counts/after_think": 30.5, + "token_counts/before_target": 2260.25, + "token_counts/before_think": 881.75 + }, + { + "avg_penalty/after_target": 2.391143321990967, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.37573515623807907, + "avg_penalty/before_think": 0.45010263472795486, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 
625.0, + "completions/max_terminated_length": 490.25, + "completions/mean_length": 181.90625, + "completions/mean_terminated_length": 167.70208549499512, + "completions/min_length": 26.75, + "completions/min_terminated_length": 26.75, + "epoch": 0.782, + "grad_norm": 3.5999555587768555, + "kl": 18.671875, + "learning_rate": 2.7703285409043192e-06, + "loss": 1.5485, + "num_tokens": 45612469.0, + "reward": 1.64453125, + "reward_std": 0.7231276631355286, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.33111046254634857, + "step": 1564, + "token_counts/after_target": 489.5, + "token_counts/after_think": 10.75, + "token_counts/before_target": 1643.25, + "token_counts/before_think": 767.0 + }, + { + "avg_penalty/after_target": 3.442979156970978, + "avg_penalty/after_think": 3.536960184574127, + "avg_penalty/before_target": 0.36714593321084976, + "avg_penalty/before_think": 0.5539379566907883, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.25, + "completions/max_terminated_length": 549.25, + "completions/mean_length": 181.0, + "completions/mean_terminated_length": 181.0, + "completions/min_length": 34.25, + "completions/min_terminated_length": 34.25, + "epoch": 0.7825, + "grad_norm": 4.290114879608154, + "kl": 25.75, + "learning_rate": 2.7582813856253276e-06, + "loss": 2.2925, + "num_tokens": 45634005.0, + "reward": 1.484375, + "reward_std": 0.8341915905475616, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4493217319250107, + "rewards/tag_count_reward/mean": 0.765625, + 
"rewards/tag_count_reward/std": 0.404941126704216, + "step": 1565, + "token_counts/after_target": 693.5, + "token_counts/after_think": 98.25, + "token_counts/before_target": 1590.25, + "token_counts/before_think": 514.0 + }, + { + "avg_penalty/after_target": 3.236303687095642, + "avg_penalty/after_think": 3.822694778442383, + "avg_penalty/before_target": 0.2129621710628271, + "avg_penalty/before_think": 0.47077521681785583, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.5, + "completions/max_terminated_length": 384.5, + "completions/mean_length": 142.6875, + "completions/mean_terminated_length": 142.6875, + "completions/min_length": 34.25, + "completions/min_terminated_length": 34.25, + "epoch": 0.783, + "grad_norm": 14.39961051940918, + "kl": 25.515625, + "learning_rate": 2.746256289877126e-06, + "loss": 1.8226, + "num_tokens": 45652657.0, + "reward": 1.41015625, + "reward_std": 0.8075558543205261, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4314897432923317, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.3828711062669754, + "step": 1566, + "token_counts/after_target": 183.25, + "token_counts/after_think": 43.75, + "token_counts/before_target": 1473.0, + "token_counts/before_think": 583.0 + }, + { + "avg_penalty/after_target": 2.6000081598758698, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.5161948353052139, + "avg_penalty/before_think": 0.5177322998642921, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.5, + "completions/max_terminated_length": 677.5, + 
"completions/mean_length": 207.9375, + "completions/mean_terminated_length": 207.9375, + "completions/min_length": 32.25, + "completions/min_terminated_length": 32.25, + "epoch": 0.7835, + "grad_norm": 7.2219743728637695, + "kl": 23.25, + "learning_rate": 2.7342532902902418e-06, + "loss": 2.2107, + "num_tokens": 45676797.0, + "reward": 1.5234375, + "reward_std": 0.7562536299228668, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4380975142121315, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3493771404027939, + "step": 1567, + "token_counts/after_target": 860.25, + "token_counts/after_think": 35.0, + "token_counts/before_target": 1385.5, + "token_counts/before_think": 1046.25 + }, + { + "avg_penalty/after_target": 2.849265396595001, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.3980163261294365, + "avg_penalty/before_think": 0.6218320056796074, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 706.75, + "completions/max_terminated_length": 634.25, + "completions/mean_length": 251.125, + "completions/mean_terminated_length": 240.03750610351562, + "completions/min_length": 58.25, + "completions/min_terminated_length": 58.25, + "epoch": 0.784, + "grad_norm": 3.048401355743408, + "kl": 19.15625, + "learning_rate": 2.7222724234278963e-06, + "loss": 1.708, + "num_tokens": 45703173.0, + "reward": 1.5078125, + "reward_std": 0.816577136516571, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.38028906285762787, + "step": 1568, + 
"token_counts/after_target": 875.25, + "token_counts/after_think": 41.75, + "token_counts/before_target": 1738.25, + "token_counts/before_think": 1362.75 + }, + { + "avg_penalty/after_target": 2.2307607233524323, + "avg_penalty/after_think": 2.7467745542526245, + "avg_penalty/before_target": 0.30471743643283844, + "avg_penalty/before_think": 0.4698238670825958, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 215.875, + "completions/mean_terminated_length": 215.875, + "completions/min_length": 38.75, + "completions/min_terminated_length": 38.75, + "epoch": 0.7845, + "grad_norm": 7.413414001464844, + "kl": 23.140625, + "learning_rate": 2.7103137257858867e-06, + "loss": 1.7616, + "num_tokens": 45728013.0, + "reward": 1.4140625, + "reward_std": 0.8464731276035309, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.45916909724473953, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.414966344833374, + "step": 1569, + "token_counts/after_target": 610.25, + "token_counts/after_think": 67.25, + "token_counts/before_target": 1715.75, + "token_counts/before_think": 1060.75 + }, + { + "avg_penalty/after_target": 2.64374777674675, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3968028351664543, + "avg_penalty/before_think": 0.408619724214077, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.25, + "completions/max_terminated_length": 549.25, + "completions/mean_length": 175.421875, + 
"completions/mean_terminated_length": 175.421875, + "completions/min_length": 44.5, + "completions/min_terminated_length": 44.5, + "epoch": 0.785, + "grad_norm": 4.7541184425354, + "kl": 23.390625, + "learning_rate": 2.698377233792476e-06, + "loss": 1.9317, + "num_tokens": 45748680.0, + "reward": 1.55078125, + "reward_std": 0.81922248005867, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42867646366357803, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.40112730860710144, + "step": 1570, + "token_counts/after_target": 483.5, + "token_counts/after_think": 40.25, + "token_counts/before_target": 1489.25, + "token_counts/before_think": 793.75 + }, + { + "avg_penalty/after_target": 2.7310346961021423, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.2834995426237583, + "avg_penalty/before_think": 0.46116771176457405, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.5, + "completions/max_terminated_length": 473.5, + "completions/mean_length": 140.65625, + "completions/mean_terminated_length": 140.65625, + "completions/min_length": 31.75, + "completions/min_terminated_length": 31.75, + "epoch": 0.7855, + "grad_norm": 3.9363889694213867, + "kl": 21.625, + "learning_rate": 2.6864629838082957e-06, + "loss": 1.8463, + "num_tokens": 45767778.0, + "reward": 1.55078125, + "reward_std": 0.7547092735767365, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4132782220840454, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.36141638457775116, + "step": 1571, + "token_counts/after_target": 261.25, + "token_counts/after_think": 
35.5, + "token_counts/before_target": 1377.5, + "token_counts/before_think": 576.25 + }, + { + "avg_penalty/after_target": 2.1728270947933197, + "avg_penalty/after_think": 2.885690212249756, + "avg_penalty/before_target": 0.4004810154438019, + "avg_penalty/before_think": 0.635009765625, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.5, + "completions/max_terminated_length": 569.5, + "completions/mean_length": 246.578125, + "completions/mean_terminated_length": 246.578125, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.786, + "grad_norm": 9.799699783325195, + "kl": 21.40625, + "learning_rate": 2.6745710121262135e-06, + "loss": 1.5787, + "num_tokens": 45795143.0, + "reward": 1.359375, + "reward_std": 0.8898705840110779, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.48989029973745346, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.4201068878173828, + "step": 1572, + "token_counts/after_target": 804.0, + "token_counts/after_think": 68.25, + "token_counts/before_target": 2020.5, + "token_counts/before_think": 1052.5 + }, + { + "avg_penalty/after_target": 2.5552306175231934, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.280512236058712, + "avg_penalty/before_think": 0.4874655455350876, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.75, + "completions/max_terminated_length": 505.75, + "completions/mean_length": 194.09375, + "completions/mean_terminated_length": 194.09375, + "completions/min_length": 45.0, + 
"completions/min_terminated_length": 45.0, + "epoch": 0.7865, + "grad_norm": 5.686315059661865, + "kl": 18.953125, + "learning_rate": 2.6627013549712355e-06, + "loss": 1.4661, + "num_tokens": 45817917.0, + "reward": 1.5, + "reward_std": 0.8008358031511307, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4163651168346405, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.3926089107990265, + "step": 1573, + "token_counts/after_target": 531.25, + "token_counts/after_think": 30.0, + "token_counts/before_target": 1568.25, + "token_counts/before_think": 976.0 + }, + { + "avg_penalty/after_target": 2.4543769359588623, + "avg_penalty/after_think": 2.975459337234497, + "avg_penalty/before_target": 0.38467632234096527, + "avg_penalty/before_think": 0.5119775980710983, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.25, + "completions/max_terminated_length": 565.25, + "completions/mean_length": 163.890625, + "completions/mean_terminated_length": 163.890625, + "completions/min_length": 50.25, + "completions/min_terminated_length": 50.25, + "epoch": 0.787, + "grad_norm": 8.71997356414795, + "kl": 15.625, + "learning_rate": 2.650854048500401e-06, + "loss": 1.5934, + "num_tokens": 45839526.0, + "reward": 1.625, + "reward_std": 0.7057032436132431, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.42516325414180756, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.3389498367905617, + "step": 1574, + "token_counts/after_target": 434.75, + "token_counts/after_think": 138.25, + "token_counts/before_target": 1150.0, + 
"token_counts/before_think": 899.25 + }, + { + "avg_penalty/after_target": 3.3275474309921265, + "avg_penalty/after_think": 1.9424425959587097, + "avg_penalty/before_target": 0.3675113022327423, + "avg_penalty/before_think": 0.4208584278821945, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 629.5, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 197.328125, + "completions/mean_terminated_length": 184.60520935058594, + "completions/min_length": 35.5, + "completions/min_terminated_length": 35.5, + "epoch": 0.7875, + "grad_norm": 13.492903709411621, + "kl": 18.609375, + "learning_rate": 2.639029128802657e-06, + "loss": 2.0587, + "num_tokens": 45862011.0, + "reward": 1.6640625, + "reward_std": 0.7130295038223267, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.42516325414180756, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.28478869423270226, + "step": 1575, + "token_counts/after_target": 687.5, + "token_counts/after_think": 102.25, + "token_counts/before_target": 1418.25, + "token_counts/before_think": 949.25 + }, + { + "avg_penalty/after_target": 3.0481494665145874, + "avg_penalty/after_think": 3.571078658103943, + "avg_penalty/before_target": 0.31650280952453613, + "avg_penalty/before_think": 0.6140966042876244, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.5, + "completions/max_terminated_length": 549.5, + "completions/mean_length": 172.8125, + "completions/mean_terminated_length": 172.8125, + "completions/min_length": 31.5, + 
"completions/min_terminated_length": 31.5, + "epoch": 0.788, + "grad_norm": 4.277018070220947, + "kl": 20.03125, + "learning_rate": 2.6272266318987606e-06, + "loss": 1.6869, + "num_tokens": 45883455.0, + "reward": 1.52734375, + "reward_std": 0.8235687911510468, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.45508860796689987, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.39064131677150726, + "step": 1576, + "token_counts/after_target": 411.75, + "token_counts/after_think": 100.25, + "token_counts/before_target": 1218.5, + "token_counts/before_think": 1034.5 + }, + { + "avg_penalty/after_target": 3.090199649333954, + "avg_penalty/after_think": 3.6068819165229797, + "avg_penalty/before_target": 0.31811750680208206, + "avg_penalty/before_think": 0.47445350140333176, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.0, + "completions/max_terminated_length": 527.0, + "completions/mean_length": 228.109375, + "completions/mean_terminated_length": 228.109375, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.7885, + "grad_norm": 7.904016494750977, + "kl": 22.6875, + "learning_rate": 2.615446593741161e-06, + "loss": 1.7554, + "num_tokens": 45909206.0, + "reward": 1.421875, + "reward_std": 0.898561641573906, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.4840351790189743, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.41901709139347076, + "step": 1577, + "token_counts/after_target": 556.75, + "token_counts/after_think": 124.5, + "token_counts/before_target": 1990.0, + 
"token_counts/before_think": 978.5 + }, + { + "avg_penalty/after_target": 2.9179049134254456, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4486878365278244, + "avg_penalty/before_think": 0.6525952816009521, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 629.5, + "completions/max_terminated_length": 609.25, + "completions/mean_length": 236.90625, + "completions/mean_terminated_length": 226.20938110351562, + "completions/min_length": 44.25, + "completions/min_terminated_length": 44.25, + "epoch": 0.789, + "grad_norm": 5.687203884124756, + "kl": 19.53125, + "learning_rate": 2.603689050213902e-06, + "loss": 1.8494, + "num_tokens": 45937296.0, + "reward": 1.51953125, + "reward_std": 0.7766944915056229, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4634971097111702, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.34332185238599777, + "step": 1578, + "token_counts/after_target": 879.0, + "token_counts/after_think": 26.0, + "token_counts/before_target": 1650.5, + "token_counts/before_think": 1235.0 + }, + { + "avg_penalty/after_target": 2.0983729660511017, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.36656704545021057, + "avg_penalty/before_think": 0.593407541513443, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 176.875, + "completions/mean_terminated_length": 176.875, + "completions/min_length": 25.25, + "completions/min_terminated_length": 25.25, + "epoch": 0.7895, + 
"grad_norm": 5.231654644012451, + "kl": 18.71875, + "learning_rate": 2.5919540371325005e-06, + "loss": 1.7868, + "num_tokens": 45956856.0, + "reward": 1.6328125, + "reward_std": 0.7556060701608658, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.36213020980358124, + "step": 1579, + "token_counts/after_target": 417.5, + "token_counts/after_think": 89.0, + "token_counts/before_target": 1263.0, + "token_counts/before_think": 1060.5 + }, + { + "avg_penalty/after_target": 2.5898145139217377, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.544821709394455, + "avg_penalty/before_think": 0.29226164519786835, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.0, + "completions/max_terminated_length": 631.0, + "completions/mean_length": 237.8125, + "completions/mean_terminated_length": 237.8125, + "completions/min_length": 40.75, + "completions/min_terminated_length": 40.75, + "epoch": 0.79, + "grad_norm": 5.7879719734191895, + "kl": 27.25, + "learning_rate": 2.5802415902438373e-06, + "loss": 2.1908, + "num_tokens": 45981164.0, + "reward": 1.29296875, + "reward_std": 0.8712650090456009, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.609375, + "rewards/format_reward/std": 0.4776429533958435, + "rewards/tag_count_reward/mean": 0.68359375, + "rewards/tag_count_reward/std": 0.42146123945713043, + "step": 1580, + "token_counts/after_target": 1220.25, + "token_counts/after_think": 12.75, + "token_counts/before_target": 1664.25, + "token_counts/before_think": 907.75 + }, + { + "avg_penalty/after_target": 2.292589783668518, + 
"avg_penalty/after_think": 2.973611891269684, + "avg_penalty/before_target": 0.5036320760846138, + "avg_penalty/before_think": 0.6407106667757034, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 691.25, + "completions/max_terminated_length": 691.25, + "completions/mean_length": 203.859375, + "completions/mean_terminated_length": 203.859375, + "completions/min_length": 45.75, + "completions/min_terminated_length": 45.75, + "epoch": 0.7905, + "grad_norm": 5.254319667816162, + "kl": 20.578125, + "learning_rate": 2.5685517452260566e-06, + "loss": 1.9105, + "num_tokens": 46003123.0, + "reward": 1.52734375, + "reward_std": 0.7770961076021194, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44187305867671967, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.37989504635334015, + "step": 1581, + "token_counts/after_target": 898.25, + "token_counts/after_think": 66.75, + "token_counts/before_target": 1283.5, + "token_counts/before_think": 1013.25 + }, + { + "avg_penalty/after_target": 1.6814091205596924, + "avg_penalty/after_think": 3.3650450706481934, + "avg_penalty/before_target": 0.4004840701818466, + "avg_penalty/before_think": 0.699205681681633, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.5, + "completions/max_terminated_length": 467.5, + "completions/mean_length": 162.5, + "completions/mean_terminated_length": 162.5, + "completions/min_length": 36.25, + "completions/min_terminated_length": 36.25, + "epoch": 0.791, + "grad_norm": 4.839038372039795, + "kl": 18.2421875, + "learning_rate": 
2.556884537688459e-06, + "loss": 1.5172, + "num_tokens": 46024339.0, + "reward": 1.6015625, + "reward_std": 0.7345343455672264, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4079566150903702, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3520274683833122, + "step": 1582, + "token_counts/after_target": 411.5, + "token_counts/after_think": 91.25, + "token_counts/before_target": 1200.0, + "token_counts/before_think": 897.25 + }, + { + "avg_penalty/after_target": 2.447054475545883, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5236950218677521, + "avg_penalty/before_think": 0.6044242158532143, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.25, + "completions/max_terminated_length": 599.25, + "completions/mean_length": 215.21875, + "completions/mean_terminated_length": 215.21875, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.7915, + "grad_norm": 3.137507438659668, + "kl": 23.8125, + "learning_rate": 2.5452400031713786e-06, + "loss": 2.1374, + "num_tokens": 46047329.0, + "reward": 1.51171875, + "reward_std": 0.8230248242616653, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4604102149605751, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.38813333213329315, + "step": 1583, + "token_counts/after_target": 829.75, + "token_counts/after_think": 93.5, + "token_counts/before_target": 1345.5, + "token_counts/before_think": 1174.75 + }, + { + "avg_penalty/after_target": 3.256295472383499, + "avg_penalty/after_think": 2.0, + 
"avg_penalty/before_target": 0.29717717692255974, + "avg_penalty/before_think": 0.3332443833351135, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.5, + "completions/max_terminated_length": 470.5, + "completions/mean_length": 164.109375, + "completions/mean_terminated_length": 164.109375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.792, + "grad_norm": 7.5992255210876465, + "kl": 22.3125, + "learning_rate": 2.5336181771460877e-06, + "loss": 2.0673, + "num_tokens": 46065912.0, + "reward": 1.55859375, + "reward_std": 0.7720885425806046, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4519384130835533, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.3449520021677017, + "step": 1584, + "token_counts/after_target": 479.5, + "token_counts/after_think": 15.0, + "token_counts/before_target": 1534.25, + "token_counts/before_think": 597.0 + }, + { + "avg_penalty/after_target": 2.235379010438919, + "avg_penalty/after_think": 3.754685938358307, + "avg_penalty/before_target": 0.4460237994790077, + "avg_penalty/before_think": 0.5671273171901703, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 828.25, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 276.71875, + "completions/mean_terminated_length": 264.60729598999023, + "completions/min_length": 48.75, + "completions/min_terminated_length": 48.75, + "epoch": 0.7925, + "grad_norm": 9.19129753112793, + "kl": 24.53125, + "learning_rate": 2.522019095014683e-06, + "loss": 1.8303, + 
"num_tokens": 46096150.0, + "reward": 1.39453125, + "reward_std": 0.8568293005228043, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.4776429533958435, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.3818703964352608, + "step": 1585, + "token_counts/after_target": 886.5, + "token_counts/after_think": 122.5, + "token_counts/before_target": 1763.75, + "token_counts/before_think": 1654.75 + }, + { + "avg_penalty/after_target": 3.0338192880153656, + "avg_penalty/after_think": 2.2262392044067383, + "avg_penalty/before_target": 0.3333664648234844, + "avg_penalty/before_think": 0.6129998117685318, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.25, + "completions/max_terminated_length": 672.25, + "completions/mean_length": 230.203125, + "completions/mean_terminated_length": 230.203125, + "completions/min_length": 39.5, + "completions/min_terminated_length": 39.5, + "epoch": 0.793, + "grad_norm": 8.585885047912598, + "kl": 27.875, + "learning_rate": 2.5104427921099783e-06, + "loss": 2.1525, + "num_tokens": 46122051.0, + "reward": 1.4296875, + "reward_std": 0.8482868820428848, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.48148179799318314, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.37428510934114456, + "step": 1586, + "token_counts/after_target": 688.75, + "token_counts/after_think": 122.0, + "token_counts/before_target": 1960.5, + "token_counts/before_think": 912.0 + }, + { + "avg_penalty/after_target": 3.426599860191345, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.28942621499300003, + 
"avg_penalty/before_think": 0.44960299879312515, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 677.5, + "completions/max_terminated_length": 541.75, + "completions/mean_length": 229.03125, + "completions/mean_terminated_length": 215.41354370117188, + "completions/min_length": 50.25, + "completions/min_terminated_length": 50.25, + "epoch": 0.7935, + "grad_norm": 5.714049816131592, + "kl": 26.140625, + "learning_rate": 2.4988893036954045e-06, + "loss": 2.098, + "num_tokens": 46147221.0, + "reward": 1.44140625, + "reward_std": 0.8711995333433151, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.46513500809669495, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.4144620969891548, + "step": 1587, + "token_counts/after_target": 851.5, + "token_counts/after_think": 10.75, + "token_counts/before_target": 1956.25, + "token_counts/before_think": 846.0 + }, + { + "avg_penalty/after_target": 2.245797425508499, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.6057090535759926, + "avg_penalty/before_think": 0.36583710834383965, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 861.0, + "completions/max_terminated_length": 755.75, + "completions/mean_length": 256.609375, + "completions/mean_terminated_length": 245.3375015258789, + "completions/min_length": 45.5, + "completions/min_terminated_length": 45.5, + "epoch": 0.794, + "grad_norm": 9.010368347167969, + "kl": 36.65625, + "learning_rate": 2.4873586649648896e-06, + "loss": 2.8576, + "num_tokens": 46173628.0, + "reward": 1.23046875, 
+ "reward_std": 0.9305475950241089, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.578125, + "rewards/format_reward/std": 0.5071863383054733, + "rewards/tag_count_reward/mean": 0.65234375, + "rewards/tag_count_reward/std": 0.44920850545167923, + "step": 1588, + "token_counts/after_target": 1330.5, + "token_counts/after_think": 31.0, + "token_counts/before_target": 2254.0, + "token_counts/before_think": 490.25 + }, + { + "avg_penalty/after_target": 2.814089357852936, + "avg_penalty/after_think": 3.4428176879882812, + "avg_penalty/before_target": 0.25527558475732803, + "avg_penalty/before_think": 0.5644755735993385, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.5, + "completions/max_terminated_length": 382.5, + "completions/mean_length": 167.40625, + "completions/mean_terminated_length": 167.40625, + "completions/min_length": 44.5, + "completions/min_terminated_length": 44.5, + "epoch": 0.7945, + "grad_norm": 5.950497627258301, + "kl": 18.84375, + "learning_rate": 2.4758509110427576e-06, + "loss": 1.5041, + "num_tokens": 46196390.0, + "reward": 1.6171875, + "reward_std": 0.8634839504957199, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.11967839300632477, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.38738394528627396, + "step": 1589, + "token_counts/after_target": 239.25, + "token_counts/after_think": 69.25, + "token_counts/before_target": 1437.5, + "token_counts/before_think": 932.5 + }, + { + "avg_penalty/after_target": 2.313378632068634, + "avg_penalty/after_think": 1.9264469742774963, + "avg_penalty/before_target": 0.4005994163453579, + "avg_penalty/before_think": 
0.45170335844159126, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 697.25, + "completions/max_terminated_length": 697.25, + "completions/mean_length": 253.140625, + "completions/mean_terminated_length": 253.140625, + "completions/min_length": 42.5, + "completions/min_terminated_length": 42.5, + "epoch": 0.795, + "grad_norm": 4.116499423980713, + "kl": 24.28125, + "learning_rate": 2.464366076983623e-06, + "loss": 1.9948, + "num_tokens": 46223119.0, + "reward": 1.3828125, + "reward_std": 0.8570199906826019, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.48558124154806137, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.39747343212366104, + "step": 1590, + "token_counts/after_target": 938.0, + "token_counts/after_think": 19.0, + "token_counts/before_target": 1745.25, + "token_counts/before_think": 1348.0 + }, + { + "avg_penalty/after_target": 2.3242330849170685, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.30568068102002144, + "avg_penalty/before_think": 0.6317690089344978, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.5, + "completions/max_terminated_length": 537.5, + "completions/mean_length": 202.359375, + "completions/mean_terminated_length": 202.359375, + "completions/min_length": 51.75, + "completions/min_terminated_length": 51.75, + "epoch": 0.7955, + "grad_norm": 6.574371814727783, + "kl": 24.265625, + "learning_rate": 2.45290419777228e-06, + "loss": 1.9061, + "num_tokens": 46244262.0, + "reward": 1.46875, + "reward_std": 0.8274869173765182, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4550696536898613, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.39092259854078293, + "step": 1591, + "token_counts/after_target": 455.5, + "token_counts/after_think": 111.0, + "token_counts/before_target": 2011.75, + "token_counts/before_think": 659.5 + }, + { + "avg_penalty/after_target": 2.9586164951324463, + "avg_penalty/after_think": 2.5625612139701843, + "avg_penalty/before_target": 0.46315842494368553, + "avg_penalty/before_think": 0.5022822096943855, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 711.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 235.8125, + "completions/mean_terminated_length": 224.50729370117188, + "completions/min_length": 45.5, + "completions/min_terminated_length": 45.5, + "epoch": 0.796, + "grad_norm": 2.95924973487854, + "kl": 25.71875, + "learning_rate": 2.441465308323605e-06, + "loss": 2.187, + "num_tokens": 46269498.0, + "reward": 1.3203125, + "reward_std": 0.917642280459404, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.48989029973745346, + "rewards/tag_count_reward/mean": 0.6796875, + "rewards/tag_count_reward/std": 0.4389082044363022, + "step": 1592, + "token_counts/after_target": 996.75, + "token_counts/after_think": 131.25, + "token_counts/before_target": 1726.75, + "token_counts/before_think": 918.25 + }, + { + "avg_penalty/after_target": 2.9476612210273743, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.2514093220233917, + "avg_penalty/before_think": 0.33680517971515656, + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 189.859375, + "completions/mean_terminated_length": 189.859375, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.7965, + "grad_norm": 4.476290225982666, + "kl": 23.828125, + "learning_rate": 2.4300494434824373e-06, + "loss": 1.9266, + "num_tokens": 46289425.0, + "reward": 1.484375, + "reward_std": 0.8374814242124557, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4704566150903702, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.38039691001176834, + "step": 1593, + "token_counts/after_target": 412.5, + "token_counts/after_think": 20.25, + "token_counts/before_target": 1910.5, + "token_counts/before_think": 694.5 + }, + { + "avg_penalty/after_target": 2.8876324594020844, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.378982562571764, + "avg_penalty/before_think": 0.43835053592920303, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.5, + "completions/max_terminated_length": 608.5, + "completions/mean_length": 201.359375, + "completions/mean_terminated_length": 201.359375, + "completions/min_length": 46.75, + "completions/min_terminated_length": 46.75, + "epoch": 0.797, + "grad_norm": 3.6940934658050537, + "kl": 22.171875, + "learning_rate": 2.41865663802348e-06, + "loss": 1.834, + "num_tokens": 46313288.0, + "reward": 1.47265625, + "reward_std": 0.8214392215013504, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 
0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.46513500809669495, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.3755294233560562, + "step": 1594, + "token_counts/after_target": 622.75, + "token_counts/after_think": 14.75, + "token_counts/before_target": 1581.5, + "token_counts/before_think": 1002.75 + }, + { + "avg_penalty/after_target": 2.4960648715496063, + "avg_penalty/after_think": 2.888257622718811, + "avg_penalty/before_target": 0.5538024976849556, + "avg_penalty/before_think": 0.5431288108229637, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.25, + "completions/max_terminated_length": 677.25, + "completions/mean_length": 241.078125, + "completions/mean_terminated_length": 241.078125, + "completions/min_length": 42.5, + "completions/min_terminated_length": 42.5, + "epoch": 0.7975, + "grad_norm": 5.8146586418151855, + "kl": 26.140625, + "learning_rate": 2.407286926651192e-06, + "loss": 2.3525, + "num_tokens": 46338029.0, + "reward": 1.375, + "reward_std": 0.8578853160142899, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4622559919953346, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.4119056463241577, + "step": 1595, + "token_counts/after_target": 970.25, + "token_counts/after_think": 145.25, + "token_counts/before_target": 1628.0, + "token_counts/before_think": 1113.75 + }, + { + "avg_penalty/after_target": 2.1464564204216003, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4452742226421833, + "avg_penalty/before_think": 0.47234828025102615, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.0, + "completions/max_terminated_length": 631.0, + "completions/mean_length": 221.5625, + "completions/mean_terminated_length": 221.5625, + "completions/min_length": 42.5, + "completions/min_terminated_length": 42.5, + "epoch": 0.798, + "grad_norm": 2.8386261463165283, + "kl": 21.421875, + "learning_rate": 2.395940343999691e-06, + "loss": 1.8384, + "num_tokens": 46363393.0, + "reward": 1.44921875, + "reward_std": 0.853690966963768, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4493217319250107, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.4093950390815735, + "step": 1596, + "token_counts/after_target": 891.0, + "token_counts/after_think": 42.25, + "token_counts/before_target": 1756.75, + "token_counts/before_think": 855.0 + }, + { + "avg_penalty/after_target": 2.6508015990257263, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.42476869001984596, + "avg_penalty/before_think": 0.5152252912521362, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 590.75, + "completions/max_terminated_length": 538.25, + "completions/mean_length": 186.21875, + "completions/mean_terminated_length": 174.01250457763672, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.7985, + "grad_norm": 10.441205978393555, + "kl": 16.796875, + "learning_rate": 2.3846169246326345e-06, + "loss": 1.8022, + "num_tokens": 46382815.0, + "reward": 1.625, + "reward_std": 0.7501364946365356, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 
0.4066260978579521, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.3516014367341995, + "step": 1597, + "token_counts/after_target": 732.5, + "token_counts/after_think": 36.0, + "token_counts/before_target": 1577.75, + "token_counts/before_think": 633.25 + }, + { + "avg_penalty/after_target": 2.577845185995102, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.32426609098911285, + "avg_penalty/before_think": 0.4902694299817085, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 621.5, + "completions/max_terminated_length": 621.5, + "completions/mean_length": 217.03125, + "completions/mean_terminated_length": 217.03125, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.799, + "grad_norm": 4.613335132598877, + "kl": 23.875, + "learning_rate": 2.3733167030431194e-06, + "loss": 2.1346, + "num_tokens": 46407713.0, + "reward": 1.43359375, + "reward_std": 0.8661565780639648, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4713720977306366, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.41327589750289917, + "step": 1598, + "token_counts/after_target": 756.0, + "token_counts/after_think": 51.0, + "token_counts/before_target": 1944.0, + "token_counts/before_think": 721.5 + }, + { + "avg_penalty/after_target": 2.6303825676441193, + "avg_penalty/after_think": 3.155268907546997, + "avg_penalty/before_target": 0.3838491812348366, + "avg_penalty/before_think": 0.3422418609261513, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + 
"completions/max_length": 658.25, + "completions/max_terminated_length": 516.75, + "completions/mean_length": 182.3125, + "completions/mean_terminated_length": 168.79479217529297, + "completions/min_length": 36.25, + "completions/min_terminated_length": 36.25, + "epoch": 0.7995, + "grad_norm": 12.462699890136719, + "kl": 19.046875, + "learning_rate": 2.362039713653581e-06, + "loss": 2.0273, + "num_tokens": 46430901.0, + "reward": 1.6484375, + "reward_std": 0.7505306303501129, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.3598056063055992, + "step": 1599, + "token_counts/after_target": 582.25, + "token_counts/after_think": 82.5, + "token_counts/before_target": 1468.5, + "token_counts/before_think": 783.75 + }, + { + "avg_penalty/after_target": 2.915225923061371, + "avg_penalty/after_think": 2.609705328941345, + "avg_penalty/before_target": 0.42262690141797066, + "avg_penalty/before_think": 0.5190161243081093, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 207.953125, + "completions/mean_terminated_length": 207.953125, + "completions/min_length": 38.25, + "completions/min_terminated_length": 38.25, + "epoch": 0.8, + "grad_norm": 4.906808376312256, + "kl": 24.9375, + "learning_rate": 2.3507859908156828e-06, + "loss": 2.2799, + "num_tokens": 46453666.0, + "reward": 1.4765625, + "reward_std": 0.8550251722335815, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4598134011030197, + "rewards/tag_count_reward/mean": 0.7578125, 
+ "rewards/tag_count_reward/std": 0.40888791531324387, + "step": 1600, + "token_counts/after_target": 859.5, + "token_counts/after_think": 49.75, + "token_counts/before_target": 1941.25, + "token_counts/before_think": 476.75 + }, + { + "avg_penalty/after_target": 1.9646906554698944, + "avg_penalty/after_think": 2.668147623538971, + "avg_penalty/before_target": 0.40106916800141335, + "avg_penalty/before_think": 0.39776670187711716, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 159.578125, + "completions/mean_terminated_length": 159.578125, + "completions/min_length": 47.25, + "completions/min_terminated_length": 47.25, + "epoch": 0.8005, + "grad_norm": 6.710657596588135, + "kl": 19.984375, + "learning_rate": 2.339555568810221e-06, + "loss": 1.9163, + "num_tokens": 46474375.0, + "reward": 1.56640625, + "reward_std": 0.7822587341070175, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4255262687802315, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.3764783963561058, + "step": 1601, + "token_counts/after_target": 334.5, + "token_counts/after_think": 130.5, + "token_counts/before_target": 1391.75, + "token_counts/before_think": 696.5 + }, + { + "avg_penalty/after_target": 2.3063048720359802, + "avg_penalty/after_think": 3.9474856853485107, + "avg_penalty/before_target": 0.2926691882312298, + "avg_penalty/before_think": 0.48913194984197617, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + 
"completions/max_terminated_length": 637.0, + "completions/mean_length": 191.875, + "completions/mean_terminated_length": 191.875, + "completions/min_length": 42.75, + "completions/min_terminated_length": 42.75, + "epoch": 0.801, + "grad_norm": 7.304980278015137, + "kl": 22.65625, + "learning_rate": 2.328348481847006e-06, + "loss": 1.739, + "num_tokens": 46497439.0, + "reward": 1.390625, + "reward_std": 0.8615321815013885, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.466681070625782, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.4071408435702324, + "step": 1602, + "token_counts/after_target": 268.0, + "token_counts/after_think": 16.75, + "token_counts/before_target": 2209.75, + "token_counts/before_think": 575.5 + }, + { + "avg_penalty/after_target": 2.182360500097275, + "avg_penalty/after_think": 3.942608416080475, + "avg_penalty/before_target": 0.46884824335575104, + "avg_penalty/before_think": 0.6562593728303909, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.5, + "completions/max_terminated_length": 703.5, + "completions/mean_length": 235.34375, + "completions/mean_terminated_length": 235.34375, + "completions/min_length": 45.25, + "completions/min_terminated_length": 45.25, + "epoch": 0.8015, + "grad_norm": 10.262928009033203, + "kl": 23.875, + "learning_rate": 2.317164764064769e-06, + "loss": 2.3585, + "num_tokens": 46523109.0, + "reward": 1.5546875, + "reward_std": 0.7942023277282715, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.43655145168304443, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3747686743736267, + 
"step": 1603, + "token_counts/after_target": 814.0, + "token_counts/after_think": 120.5, + "token_counts/before_target": 2096.5, + "token_counts/before_think": 734.5 + }, + { + "avg_penalty/after_target": 2.606171041727066, + "avg_penalty/after_think": 3.732966423034668, + "avg_penalty/before_target": 0.3295665495097637, + "avg_penalty/before_think": 0.4793291836977005, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 190.609375, + "completions/mean_terminated_length": 190.609375, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.802, + "grad_norm": 12.14499282836914, + "kl": 14.734375, + "learning_rate": 2.3060044495310507e-06, + "loss": 1.7009, + "num_tokens": 46544492.0, + "reward": 1.671875, + "reward_std": 0.6432940810918808, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.39476002007722855, + "rewards/tag_count_reward/mean": 0.875, + "rewards/tag_count_reward/std": 0.27346450090408325, + "step": 1604, + "token_counts/after_target": 559.25, + "token_counts/after_think": 233.25, + "token_counts/before_target": 1204.0, + "token_counts/before_think": 1053.25 + }, + { + "avg_penalty/after_target": 3.168127119541168, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4186104014515877, + "avg_penalty/before_think": 0.4191965311765671, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.25, + "completions/max_terminated_length": 643.25, + "completions/mean_length": 210.9375, + 
"completions/mean_terminated_length": 210.9375, + "completions/min_length": 30.5, + "completions/min_terminated_length": 30.5, + "epoch": 0.8025, + "grad_norm": 2.737121105194092, + "kl": 24.21875, + "learning_rate": 2.2948675722421086e-06, + "loss": 2.1243, + "num_tokens": 46567848.0, + "reward": 1.3984375, + "reward_std": 0.836240753531456, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.42739029973745346, + "rewards/tag_count_reward/mean": 0.7109375, + "rewards/tag_count_reward/std": 0.4185362383723259, + "step": 1605, + "token_counts/after_target": 818.0, + "token_counts/after_think": 54.25, + "token_counts/before_target": 1691.25, + "token_counts/before_think": 811.5 + }, + { + "avg_penalty/after_target": 2.203382194042206, + "avg_penalty/after_think": 3.8678979873657227, + "avg_penalty/before_target": 0.36840618401765823, + "avg_penalty/before_think": 0.4988759756088257, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.25, + "completions/max_terminated_length": 484.25, + "completions/mean_length": 188.984375, + "completions/mean_terminated_length": 188.984375, + "completions/min_length": 35.75, + "completions/min_terminated_length": 35.75, + "epoch": 0.803, + "grad_norm": 3.867243528366089, + "kl": 23.46875, + "learning_rate": 2.2837541661228024e-06, + "loss": 1.8995, + "num_tokens": 46590183.0, + "reward": 1.51953125, + "reward_std": 0.8300445526838303, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4462348371744156, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.39791369438171387, + "step": 1606, + "token_counts/after_target": 425.0, + 
"token_counts/after_think": 39.0, + "token_counts/before_target": 1789.25, + "token_counts/before_think": 770.5 + }, + { + "avg_penalty/after_target": 1.7865589559078217, + "avg_penalty/after_think": 2.5263513922691345, + "avg_penalty/before_target": 0.3473866134881973, + "avg_penalty/before_think": 0.36832164973020554, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 590.75, + "completions/max_terminated_length": 434.25, + "completions/mean_length": 149.453125, + "completions/mean_terminated_length": 135.38750076293945, + "completions/min_length": 41.5, + "completions/min_terminated_length": 41.5, + "epoch": 0.8035, + "grad_norm": 3.928579330444336, + "kl": 25.59375, + "learning_rate": 2.27266426502649e-06, + "loss": 2.0937, + "num_tokens": 46609844.0, + "reward": 1.58984375, + "reward_std": 0.7980359494686127, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.3986557498574257, + "step": 1607, + "token_counts/after_target": 280.0, + "token_counts/after_think": 33.25, + "token_counts/before_target": 1461.75, + "token_counts/before_think": 616.25 + }, + { + "avg_penalty/after_target": 3.2428465485572815, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.326054897159338, + "avg_penalty/before_think": 0.39720357209444046, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.25, + "completions/max_terminated_length": 663.25, + "completions/mean_length": 226.875, + "completions/mean_terminated_length": 226.875, + 
"completions/min_length": 47.75, + "completions/min_terminated_length": 47.75, + "epoch": 0.804, + "grad_norm": 5.162264823913574, + "kl": 21.953125, + "learning_rate": 2.261597902734939e-06, + "loss": 2.0358, + "num_tokens": 46633084.0, + "reward": 1.55859375, + "reward_std": 0.7917466312646866, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.41503459960222244, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.4032464176416397, + "step": 1608, + "token_counts/after_target": 831.5, + "token_counts/after_think": 25.0, + "token_counts/before_target": 1841.25, + "token_counts/before_think": 932.25 + }, + { + "avg_penalty/after_target": 2.678932249546051, + "avg_penalty/after_think": 1.8678512573242188, + "avg_penalty/before_target": 0.4456000253558159, + "avg_penalty/before_think": 0.4954841211438179, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 664.75, + "completions/max_terminated_length": 550.5, + "completions/mean_length": 216.90625, + "completions/mean_terminated_length": 191.99553680419922, + "completions/min_length": 39.75, + "completions/min_terminated_length": 39.75, + "epoch": 0.8045, + "grad_norm": 7.532408714294434, + "kl": 25.09375, + "learning_rate": 2.2505551129582047e-06, + "loss": 2.269, + "num_tokens": 46654630.0, + "reward": 1.53515625, + "reward_std": 0.8590694665908813, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4308478757739067, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.4158165082335472, + "step": 1609, + "token_counts/after_target": 879.0, + "token_counts/after_think": 50.5, + 
"token_counts/before_target": 1971.5, + "token_counts/before_think": 569.5 + }, + { + "avg_penalty/after_target": 2.1195154786109924, + "avg_penalty/after_think": 1.7164708375930786, + "avg_penalty/before_target": 0.4348263368010521, + "avg_penalty/before_think": 0.34341610968112946, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.75, + "completions/max_terminated_length": 611.75, + "completions/mean_length": 170.375, + "completions/mean_terminated_length": 170.375, + "completions/min_length": 24.5, + "completions/min_terminated_length": 24.5, + "epoch": 0.805, + "grad_norm": 9.765463829040527, + "kl": 19.984375, + "learning_rate": 2.2395359293345396e-06, + "loss": 1.9523, + "num_tokens": 46677006.0, + "reward": 1.66796875, + "reward_std": 0.8139618188142776, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4141380712389946, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.3754679039120674, + "step": 1610, + "token_counts/after_target": 521.5, + "token_counts/after_think": 67.75, + "token_counts/before_target": 1360.5, + "token_counts/before_think": 776.25 + }, + { + "avg_penalty/after_target": 2.028704732656479, + "avg_penalty/after_think": 1.355483889579773, + "avg_penalty/before_target": 0.5916302874684334, + "avg_penalty/before_think": 0.4987782686948776, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 716.25, + "completions/max_terminated_length": 617.5, + "completions/mean_length": 222.3125, + "completions/mean_terminated_length": 210.51771545410156, + 
"completions/min_length": 31.25, + "completions/min_terminated_length": 31.25, + "epoch": 0.8055, + "grad_norm": 6.881211280822754, + "kl": 19.625, + "learning_rate": 2.2285403854302912e-06, + "loss": 1.8765, + "num_tokens": 46701506.0, + "reward": 1.46484375, + "reward_std": 0.824538454413414, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.46566852182149887, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.3942726328969002, + "step": 1611, + "token_counts/after_target": 1176.5, + "token_counts/after_think": 7.5, + "token_counts/before_target": 1479.75, + "token_counts/before_think": 893.25 + }, + { + "avg_penalty/after_target": 2.1210087537765503, + "avg_penalty/after_think": 2.6710010766983032, + "avg_penalty/before_target": 0.4072243943810463, + "avg_penalty/before_think": 0.4205974116921425, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.75, + "completions/max_terminated_length": 554.75, + "completions/mean_length": 201.765625, + "completions/mean_terminated_length": 201.765625, + "completions/min_length": 50.75, + "completions/min_terminated_length": 50.75, + "epoch": 0.806, + "grad_norm": 4.422877788543701, + "kl": 18.3291015625, + "learning_rate": 2.2175685147397906e-06, + "loss": 1.7345, + "num_tokens": 46721987.0, + "reward": 1.6015625, + "reward_std": 0.6439367085695267, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.3375816270709038, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3228202164173126, + "step": 1612, + "token_counts/after_target": 658.75, + "token_counts/after_think": 152.75, + 
"token_counts/before_target": 1556.5, + "token_counts/before_think": 860.25 + }, + { + "avg_penalty/after_target": 2.801293730735779, + "avg_penalty/after_think": 1.9235325455665588, + "avg_penalty/before_target": 0.3241215720772743, + "avg_penalty/before_think": 0.4371239244937897, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.0, + "completions/max_terminated_length": 619.0, + "completions/mean_length": 211.703125, + "completions/mean_terminated_length": 211.703125, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.8065, + "grad_norm": 4.465694904327393, + "kl": 23.59375, + "learning_rate": 2.206620350685257e-06, + "loss": 1.9433, + "num_tokens": 46744576.0, + "reward": 1.48828125, + "reward_std": 0.8751704692840576, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4471946656703949, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.41920673847198486, + "step": 1613, + "token_counts/after_target": 484.25, + "token_counts/after_think": 75.75, + "token_counts/before_target": 1774.75, + "token_counts/before_think": 1052.5 + }, + { + "avg_penalty/after_target": 2.4723143577575684, + "avg_penalty/after_think": 2.977476954460144, + "avg_penalty/before_target": 0.6578270271420479, + "avg_penalty/before_think": 0.4817312881350517, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 716.75, + "completions/max_terminated_length": 638.5, + "completions/mean_length": 178.453125, + "completions/mean_terminated_length": 164.73854446411133, + 
"completions/min_length": 36.5, + "completions/min_terminated_length": 36.5, + "epoch": 0.807, + "grad_norm": 18.585729598999023, + "kl": 19.984375, + "learning_rate": 2.195695926616702e-06, + "loss": 2.3926, + "num_tokens": 46766237.0, + "reward": 1.70703125, + "reward_std": 0.655770406126976, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38336414843797684, + "rewards/tag_count_reward/mean": 0.87890625, + "rewards/tag_count_reward/std": 0.28620753064751625, + "step": 1614, + "token_counts/after_target": 824.0, + "token_counts/after_think": 115.0, + "token_counts/before_target": 1074.0, + "token_counts/before_think": 842.25 + }, + { + "avg_penalty/after_target": 2.6321601271629333, + "avg_penalty/after_think": 3.421240895986557, + "avg_penalty/before_target": 0.4467647820711136, + "avg_penalty/before_think": 0.5665005743503571, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.75, + "completions/max_terminated_length": 628.75, + "completions/mean_length": 230.328125, + "completions/mean_terminated_length": 230.328125, + "completions/min_length": 52.25, + "completions/min_terminated_length": 52.25, + "epoch": 0.8075, + "grad_norm": 4.326373100280762, + "kl": 21.390625, + "learning_rate": 2.1847952758118118e-06, + "loss": 1.8117, + "num_tokens": 46790818.0, + "reward": 1.6171875, + "reward_std": 0.7688266187906265, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.3659607917070389, + "step": 1615, + "token_counts/after_target": 799.0, + "token_counts/after_think": 135.0, + 
"token_counts/before_target": 1334.25, + "token_counts/before_think": 1417.0 + }, + { + "avg_penalty/after_target": 2.802486389875412, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.29180674627423286, + "avg_penalty/before_think": 0.44677821546792984, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 728.75, + "completions/max_terminated_length": 641.5, + "completions/mean_length": 204.484375, + "completions/mean_terminated_length": 191.3062515258789, + "completions/min_length": 57.5, + "completions/min_terminated_length": 57.5, + "epoch": 0.808, + "grad_norm": 3.527045965194702, + "kl": 22.53125, + "learning_rate": 2.173918431475861e-06, + "loss": 1.8978, + "num_tokens": 46811361.0, + "reward": 1.671875, + "reward_std": 0.7466835081577301, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38772592693567276, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.36478035151958466, + "step": 1616, + "token_counts/after_target": 475.5, + "token_counts/after_think": 64.0, + "token_counts/before_target": 1735.5, + "token_counts/before_think": 996.75 + }, + { + "avg_penalty/after_target": 2.48233562707901, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.37280935049057007, + "avg_penalty/before_think": 0.6025516204535961, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 542.0, + "completions/max_terminated_length": 421.75, + "completions/mean_length": 216.09375, + "completions/mean_terminated_length": 192.2857208251953, + "completions/min_length": 58.75, + 
"completions/min_terminated_length": 58.75, + "epoch": 0.8085, + "grad_norm": 8.645734786987305, + "kl": 23.03125, + "learning_rate": 2.163065426741603e-06, + "loss": 1.6992, + "num_tokens": 46835703.0, + "reward": 1.5, + "reward_std": 0.7923158705234528, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4380975142121315, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.37168944627046585, + "step": 1617, + "token_counts/after_target": 619.0, + "token_counts/after_think": 40.75, + "token_counts/before_target": 1937.25, + "token_counts/before_think": 860.5 + }, + { + "avg_penalty/after_target": 1.9641651213169098, + "avg_penalty/after_think": 3.9582449197769165, + "avg_penalty/before_target": 0.4654470533132553, + "avg_penalty/before_think": 0.3654923439025879, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.5, + "completions/max_terminated_length": 687.5, + "completions/mean_length": 208.640625, + "completions/mean_terminated_length": 208.640625, + "completions/min_length": 50.5, + "completions/min_terminated_length": 50.5, + "epoch": 0.809, + "grad_norm": 11.593254089355469, + "kl": 29.0, + "learning_rate": 2.15223629466917e-06, + "loss": 2.15, + "num_tokens": 46859504.0, + "reward": 1.41015625, + "reward_std": 0.8508396595716476, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.48935678601264954, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.38996177166700363, + "step": 1618, + "token_counts/after_target": 661.75, + "token_counts/after_think": 34.0, + "token_counts/before_target": 2041.5, + "token_counts/before_think": 601.0 + 
}, + { + "avg_penalty/after_target": 3.3623892664909363, + "avg_penalty/after_think": 3.6528276205062866, + "avg_penalty/before_target": 0.30941691622138023, + "avg_penalty/before_think": 0.3531137481331825, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 638.0, + "completions/max_terminated_length": 557.5, + "completions/mean_length": 192.765625, + "completions/mean_terminated_length": 179.99479293823242, + "completions/min_length": 43.25, + "completions/min_terminated_length": 43.25, + "epoch": 0.8095, + "grad_norm": 4.736268520355225, + "kl": 22.84375, + "learning_rate": 2.1414310682459805e-06, + "loss": 1.9054, + "num_tokens": 46883073.0, + "reward": 1.5859375, + "reward_std": 0.7819357961416245, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.43303824216127396, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.3635328561067581, + "step": 1619, + "token_counts/after_target": 607.0, + "token_counts/after_think": 23.0, + "token_counts/before_target": 1506.75, + "token_counts/before_think": 947.5 + }, + { + "avg_penalty/after_target": 2.294194757938385, + "avg_penalty/after_think": 3.3527219593524933, + "avg_penalty/before_target": 0.38522762805223465, + "avg_penalty/before_think": 0.42460183799266815, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 629.0, + "completions/max_terminated_length": 507.5, + "completions/mean_length": 179.671875, + "completions/mean_terminated_length": 167.20729446411133, + "completions/min_length": 48.5, + "completions/min_terminated_length": 48.5, + "epoch": 
0.81, + "grad_norm": 11.591878890991211, + "kl": 26.9375, + "learning_rate": 2.130649780386628e-06, + "loss": 1.984, + "num_tokens": 46904220.0, + "reward": 1.51953125, + "reward_std": 0.7687628120183945, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.41168536990880966, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.36828820407390594, + "step": 1620, + "token_counts/after_target": 366.75, + "token_counts/after_think": 20.75, + "token_counts/before_target": 1709.5, + "token_counts/before_think": 777.75 + }, + { + "avg_penalty/after_target": 2.640647977590561, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.35089702159166336, + "avg_penalty/before_think": 0.4590684473514557, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 667.25, + "completions/max_terminated_length": 667.25, + "completions/mean_length": 215.953125, + "completions/mean_terminated_length": 215.953125, + "completions/min_length": 52.25, + "completions/min_terminated_length": 52.25, + "epoch": 0.8105, + "grad_norm": 4.3325066566467285, + "kl": 20.390625, + "learning_rate": 2.119892463932781e-06, + "loss": 1.9062, + "num_tokens": 46927849.0, + "reward": 1.6328125, + "reward_std": 0.7503267079591751, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.35507555305957794, + "step": 1621, + "token_counts/after_target": 523.75, + "token_counts/after_think": 74.25, + "token_counts/before_target": 1901.0, + "token_counts/before_think": 956.25 + }, + { + "avg_penalty/after_target": 
2.952982783317566, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3969276323914528, + "avg_penalty/before_think": 0.4056353345513344, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.75, + "completions/max_terminated_length": 634.75, + "completions/mean_length": 217.328125, + "completions/mean_terminated_length": 217.328125, + "completions/min_length": 41.25, + "completions/min_terminated_length": 41.25, + "epoch": 0.811, + "grad_norm": 3.9891157150268555, + "kl": 23.40625, + "learning_rate": 2.1091591516530952e-06, + "loss": 2.0809, + "num_tokens": 46952414.0, + "reward": 1.5390625, + "reward_std": 0.7115332186222076, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.41956035792827606, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3211289569735527, + "step": 1622, + "token_counts/after_target": 900.5, + "token_counts/after_think": 76.0, + "token_counts/before_target": 1437.25, + "token_counts/before_think": 1063.5 + }, + { + "avg_penalty/after_target": 2.0920362770557404, + "avg_penalty/after_think": 2.894971787929535, + "avg_penalty/before_target": 0.31703465431928635, + "avg_penalty/before_think": 0.44766855239868164, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 643.25, + "completions/max_terminated_length": 499.25, + "completions/mean_length": 174.234375, + "completions/mean_terminated_length": 161.24479293823242, + "completions/min_length": 38.25, + "completions/min_terminated_length": 38.25, + "epoch": 0.8115, + "grad_norm": 3.900827407836914, + "kl": 
15.9921875, + "learning_rate": 2.098449876243096e-06, + "loss": 1.3546, + "num_tokens": 46974397.0, + "reward": 1.58984375, + "reward_std": 0.7388089895248413, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42733466625213623, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.34006384015083313, + "step": 1623, + "token_counts/after_target": 491.5, + "token_counts/after_think": 18.5, + "token_counts/before_target": 1276.5, + "token_counts/before_think": 1001.25 + }, + { + "avg_penalty/after_target": 2.532137840986252, + "avg_penalty/after_think": 2.6661354303359985, + "avg_penalty/before_target": 0.3415972515940666, + "avg_penalty/before_think": 0.2688799537718296, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.5, + "completions/max_terminated_length": 390.5, + "completions/mean_length": 131.484375, + "completions/mean_terminated_length": 131.484375, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.812, + "grad_norm": 5.6591315269470215, + "kl": 18.814453125, + "learning_rate": 2.0877646703251e-06, + "loss": 1.5355, + "num_tokens": 46992252.0, + "reward": 1.609375, + "reward_std": 0.6418961584568024, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.3454566150903702, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.3170805722475052, + "step": 1624, + "token_counts/after_target": 321.75, + "token_counts/after_think": 50.75, + "token_counts/before_target": 1153.75, + "token_counts/before_think": 577.5 + }, + { + "avg_penalty/after_target": 2.6295979619026184, + "avg_penalty/after_think": 
3.778020739555359, + "avg_penalty/before_target": 0.3741345442831516, + "avg_penalty/before_think": 0.49148597568273544, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.25, + "completions/max_terminated_length": 520.25, + "completions/mean_length": 212.71875, + "completions/mean_terminated_length": 212.71875, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.8125, + "grad_norm": 3.77622389793396, + "kl": 19.59375, + "learning_rate": 2.0771035664480944e-06, + "loss": 1.6623, + "num_tokens": 47015114.0, + "reward": 1.58984375, + "reward_std": 0.7778589874505997, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.41194770485162735, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.37475188821554184, + "step": 1625, + "token_counts/after_target": 486.25, + "token_counts/after_think": 81.5, + "token_counts/before_target": 1834.5, + "token_counts/before_think": 1001.25 + }, + { + "avg_penalty/after_target": 2.562154322862625, + "avg_penalty/after_think": 2.864628314971924, + "avg_penalty/before_target": 0.3639131411910057, + "avg_penalty/before_think": 0.42169685289263725, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.5, + "completions/max_terminated_length": 499.5, + "completions/mean_length": 171.75, + "completions/mean_terminated_length": 171.75, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.813, + "grad_norm": 4.136168956756592, + "kl": 16.6484375, + "learning_rate": 2.0664665970876496e-06, + "loss": 1.4382, 
+ "num_tokens": 47037530.0, + "reward": 1.63671875, + "reward_std": 0.7872204184532166, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.125, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4057852029800415, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.36761847138404846, + "step": 1626, + "token_counts/after_target": 424.5, + "token_counts/after_think": 29.75, + "token_counts/before_target": 1398.5, + "token_counts/before_think": 895.25 + }, + { + "avg_penalty/after_target": 2.48468554019928, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.5633483827114105, + "avg_penalty/before_think": 0.4574035480618477, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 223.21875, + "completions/mean_terminated_length": 223.21875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.8135, + "grad_norm": 4.211803913116455, + "kl": 24.75, + "learning_rate": 2.0558537946458177e-06, + "loss": 2.0569, + "num_tokens": 47060328.0, + "reward": 1.43359375, + "reward_std": 0.8767912536859512, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4761601909995079, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.4205627366900444, + "step": 1627, + "token_counts/after_target": 786.5, + "token_counts/after_think": 132.0, + "token_counts/before_target": 1644.0, + "token_counts/before_think": 1009.0 + }, + { + "avg_penalty/after_target": 2.8550559878349304, + "avg_penalty/after_think": 3.971213400363922, + "avg_penalty/before_target": 0.3629944398999214, + 
"avg_penalty/before_think": 0.4605237767100334, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 192.625, + "completions/mean_terminated_length": 192.625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.814, + "grad_norm": 8.030430793762207, + "kl": 19.2451171875, + "learning_rate": 2.0452651914510414e-06, + "loss": 1.5828, + "num_tokens": 47080352.0, + "reward": 1.61328125, + "reward_std": 0.6671986877918243, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.34854350984096527, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.30565066635608673, + "step": 1628, + "token_counts/after_target": 562.0, + "token_counts/after_think": 32.5, + "token_counts/before_target": 1730.25, + "token_counts/before_think": 757.25 + }, + { + "avg_penalty/after_target": 3.218951016664505, + "avg_penalty/after_think": 2.4920613169670105, + "avg_penalty/before_target": 0.3049762398004532, + "avg_penalty/before_think": 0.35120848193764687, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.25, + "completions/max_terminated_length": 555.25, + "completions/mean_length": 187.046875, + "completions/mean_terminated_length": 187.046875, + "completions/min_length": 36.5, + "completions/min_terminated_length": 36.5, + "epoch": 0.8145, + "grad_norm": 4.247961044311523, + "kl": 23.59375, + "learning_rate": 2.0347008197580376e-06, + "loss": 2.0568, + "num_tokens": 47104451.0, + "reward": 
1.43359375, + "reward_std": 0.8453839421272278, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4581565484404564, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.40682120621204376, + "step": 1629, + "token_counts/after_target": 656.5, + "token_counts/after_think": 47.25, + "token_counts/before_target": 1514.0, + "token_counts/before_think": 775.0 + }, + { + "avg_penalty/after_target": 3.466451644897461, + "avg_penalty/after_think": 3.8763116002082825, + "avg_penalty/before_target": 0.31419995054602623, + "avg_penalty/before_think": 0.3638191409409046, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.75, + "completions/max_terminated_length": 443.75, + "completions/mean_length": 144.375, + "completions/mean_terminated_length": 144.375, + "completions/min_length": 41.75, + "completions/min_terminated_length": 41.75, + "epoch": 0.815, + "grad_norm": 7.554825305938721, + "kl": 17.796875, + "learning_rate": 2.024160711747717e-06, + "loss": 1.7938, + "num_tokens": 47122843.0, + "reward": 1.625, + "reward_std": 0.7330465614795685, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.38724804669618607, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.35440728068351746, + "step": 1630, + "token_counts/after_target": 420.25, + "token_counts/after_think": 77.5, + "token_counts/before_target": 964.5, + "token_counts/before_think": 847.75 + }, + { + "avg_penalty/after_target": 2.5740689039230347, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.49898749217391014, + "avg_penalty/before_think": 0.6019815355539322, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.5, + "completions/max_terminated_length": 576.5, + "completions/mean_length": 178.515625, + "completions/mean_terminated_length": 178.515625, + "completions/min_length": 52.75, + "completions/min_terminated_length": 52.75, + "epoch": 0.8155, + "grad_norm": 10.671710968017578, + "kl": 15.7265625, + "learning_rate": 2.013644899527074e-06, + "loss": 1.6697, + "num_tokens": 47145484.0, + "reward": 1.6484375, + "reward_std": 0.6902919411659241, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.42080147564411163, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.28869710117578506, + "step": 1631, + "token_counts/after_target": 575.0, + "token_counts/after_think": 83.25, + "token_counts/before_target": 1215.75, + "token_counts/before_think": 982.25 + }, + { + "avg_penalty/after_target": 3.0645161867141724, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.38328464701771736, + "avg_penalty/before_think": 0.4525849148631096, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 602.5, + "completions/max_terminated_length": 480.75, + "completions/mean_length": 176.34375, + "completions/mean_terminated_length": 163.19166946411133, + "completions/min_length": 21.75, + "completions/min_terminated_length": 21.75, + "epoch": 0.816, + "grad_norm": 7.388377666473389, + "kl": 22.1875, + "learning_rate": 2.0031534151290944e-06, + "loss": 2.0805, + "num_tokens": 47169026.0, + "reward": 1.515625, + "reward_std": 0.8358447402715683, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4519384130835533, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.40064916759729385, + "step": 1632, + "token_counts/after_target": 677.0, + "token_counts/after_think": 122.25, + "token_counts/before_target": 1344.75, + "token_counts/before_think": 677.5 + }, + { + "avg_penalty/after_target": 2.3003556430339813, + "avg_penalty/after_think": 2.7420259714126587, + "avg_penalty/before_target": 0.5270478576421738, + "avg_penalty/before_think": 0.5019631162285805, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 754.25, + "completions/max_terminated_length": 730.5, + "completions/mean_length": 226.484375, + "completions/mean_terminated_length": 213.0718765258789, + "completions/min_length": 47.5, + "completions/min_terminated_length": 47.5, + "epoch": 0.8165, + "grad_norm": 11.657817840576172, + "kl": 36.75, + "learning_rate": 1.9926862905126663e-06, + "loss": 2.7331, + "num_tokens": 47191537.0, + "reward": 1.2421875, + "reward_std": 0.912707656621933, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.59375, + "rewards/format_reward/std": 0.48866813629865646, + "rewards/tag_count_reward/mean": 0.6484375, + "rewards/tag_count_reward/std": 0.4490017890930176, + "step": 1633, + "token_counts/after_target": 988.75, + "token_counts/after_think": 19.0, + "token_counts/before_target": 1911.5, + "token_counts/before_think": 704.5 + }, + { + "avg_penalty/after_target": 3.146216630935669, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.26964695379137993, + "avg_penalty/before_think": 0.3768555633723736, + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.25, + "completions/max_terminated_length": 502.25, + "completions/mean_length": 158.703125, + "completions/mean_terminated_length": 158.703125, + "completions/min_length": 37.75, + "completions/min_terminated_length": 37.75, + "epoch": 0.817, + "grad_norm": 6.625213146209717, + "kl": 20.6015625, + "learning_rate": 1.982243557562461e-06, + "loss": 1.9366, + "num_tokens": 47213086.0, + "reward": 1.640625, + "reward_std": 0.7375327348709106, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3837348371744156, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.3598443791270256, + "step": 1634, + "token_counts/after_target": 412.0, + "token_counts/after_think": 121.0, + "token_counts/before_target": 1324.75, + "token_counts/before_think": 681.5 + }, + { + "avg_penalty/after_target": 2.6897498965263367, + "avg_penalty/after_think": 3.9539977312088013, + "avg_penalty/before_target": 0.3916356712579727, + "avg_penalty/before_think": 0.45187705755233765, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.75, + "completions/max_terminated_length": 379.75, + "completions/mean_length": 141.40625, + "completions/mean_terminated_length": 141.40625, + "completions/min_length": 44.5, + "completions/min_terminated_length": 44.5, + "epoch": 0.8175, + "grad_norm": 9.445015907287598, + "kl": 15.4921875, + "learning_rate": 1.9718252480888567e-06, + "loss": 1.6107, + "num_tokens": 47231464.0, + "reward": 1.73046875, + "reward_std": 0.6481740176677704, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36483466625213623, + "rewards/tag_count_reward/mean": 0.88671875, + "rewards/tag_count_reward/std": 0.3033241629600525, + "step": 1635, + "token_counts/after_target": 381.0, + "token_counts/after_think": 103.25, + "token_counts/before_target": 1042.75, + "token_counts/before_think": 735.5 + }, + { + "avg_penalty/after_target": 2.498003751039505, + "avg_penalty/after_think": 3.7473491430282593, + "avg_penalty/before_target": 0.36979425325989723, + "avg_penalty/before_think": 0.3671356439590454, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.25, + "completions/max_terminated_length": 451.25, + "completions/mean_length": 130.796875, + "completions/mean_terminated_length": 130.796875, + "completions/min_length": 42.5, + "completions/min_terminated_length": 42.5, + "epoch": 0.818, + "grad_norm": 5.01133394241333, + "kl": 25.71875, + "learning_rate": 1.961431393827827e-06, + "loss": 2.0651, + "num_tokens": 47247707.0, + "reward": 1.5859375, + "reward_std": 0.8129734694957733, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.42516325414180756, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.39340896904468536, + "step": 1636, + "token_counts/after_target": 265.5, + "token_counts/after_think": 47.75, + "token_counts/before_target": 1145.5, + "token_counts/before_think": 634.0 + }, + { + "avg_penalty/after_target": 2.6357903480529785, + "avg_penalty/after_think": 2.4517091512680054, + "avg_penalty/before_target": 0.2560790739953518, + "avg_penalty/before_think": 0.3855675049126148, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.75, + "completions/max_terminated_length": 364.75, + "completions/mean_length": 159.109375, + "completions/mean_terminated_length": 159.109375, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.8185, + "grad_norm": 2.5345921516418457, + "kl": 11.1875, + "learning_rate": 1.95106202644086e-06, + "loss": 0.9547, + "num_tokens": 47266930.0, + "reward": 1.609375, + "reward_std": 0.7568329423666, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.351981557905674, + "step": 1637, + "token_counts/after_target": 349.0, + "token_counts/after_think": 17.25, + "token_counts/before_target": 1205.25, + "token_counts/before_think": 974.25 + }, + { + "avg_penalty/after_target": 2.48984557390213, + "avg_penalty/after_think": 3.5097422003746033, + "avg_penalty/before_target": 0.2928808219730854, + "avg_penalty/before_think": 0.3809570372104645, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.25, + "completions/max_terminated_length": 582.25, + "completions/mean_length": 156.65625, + "completions/mean_terminated_length": 156.65625, + "completions/min_length": 30.75, + "completions/min_terminated_length": 30.75, + "epoch": 0.819, + "grad_norm": 4.226430892944336, + "kl": 22.21875, + "learning_rate": 1.940717177514844e-06, + "loss": 1.8917, + "num_tokens": 47290236.0, + "reward": 1.58203125, + "reward_std": 0.8121157288551331, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.42516325414180756, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.389815591275692, + "step": 1638, + "token_counts/after_target": 404.0, + "token_counts/after_think": 34.25, + "token_counts/before_target": 1412.0, + "token_counts/before_think": 656.25 + }, + { + "avg_penalty/after_target": 2.5133865773677826, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3907612934708595, + "avg_penalty/before_think": 0.7807877734303474, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 691.0, + "completions/max_terminated_length": 691.0, + "completions/mean_length": 227.421875, + "completions/mean_terminated_length": 227.421875, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.8195, + "grad_norm": 8.361684799194336, + "kl": 27.828125, + "learning_rate": 1.930396878561983e-06, + "loss": 2.1424, + "num_tokens": 47313895.0, + "reward": 1.34765625, + "reward_std": 0.8778918236494064, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.47669370472431183, + "rewards/tag_count_reward/mean": 0.70703125, + "rewards/tag_count_reward/std": 0.42166154086589813, + "step": 1639, + "token_counts/after_target": 702.25, + "token_counts/after_think": 212.5, + "token_counts/before_target": 2186.75, + "token_counts/before_think": 537.25 + }, + { + "avg_penalty/after_target": 2.404266357421875, + "avg_penalty/after_think": 3.065831035375595, + "avg_penalty/before_target": 0.2911403700709343, + "avg_penalty/before_think": 0.38023925572633743, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.5, + "completions/max_terminated_length": 592.5, + "completions/mean_length": 194.671875, + "completions/mean_terminated_length": 194.671875, + "completions/min_length": 44.25, + "completions/min_terminated_length": 44.25, + "epoch": 0.82, + "grad_norm": 15.923394203186035, + "kl": 31.65625, + "learning_rate": 1.9201011610196972e-06, + "loss": 2.1745, + "num_tokens": 47334050.0, + "reward": 1.33984375, + "reward_std": 0.9172898977994919, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.48989029973745346, + "rewards/tag_count_reward/mean": 0.69921875, + "rewards/tag_count_reward/std": 0.44300852715969086, + "step": 1640, + "token_counts/after_target": 400.5, + "token_counts/after_think": 18.75, + "token_counts/before_target": 2283.75, + "token_counts/before_think": 411.75 + }, + { + "avg_penalty/after_target": 3.1918352246284485, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.280749149620533, + "avg_penalty/before_think": 0.45674148201942444, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.5, + "completions/max_terminated_length": 502.5, + "completions/mean_length": 169.5625, + "completions/mean_terminated_length": 169.5625, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.8205, + "grad_norm": 4.063378810882568, + "kl": 17.0, + "learning_rate": 1.9098300562505266e-06, + "loss": 1.6125, + "num_tokens": 47356086.0, + "reward": 1.73046875, + "reward_std": 0.6107901334762573, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 
0.3604728877544403, + "rewards/tag_count_reward/mean": 0.88671875, + "rewards/tag_count_reward/std": 0.25981205701828003, + "step": 1641, + "token_counts/after_target": 389.75, + "token_counts/after_think": 28.75, + "token_counts/before_target": 1695.5, + "token_counts/before_think": 599.0 + }, + { + "avg_penalty/after_target": 2.206370323896408, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3989289030432701, + "avg_penalty/before_think": 0.43343891203403473, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.5, + "completions/max_terminated_length": 687.5, + "completions/mean_length": 228.375, + "completions/mean_terminated_length": 228.375, + "completions/min_length": 50.75, + "completions/min_terminated_length": 50.75, + "epoch": 0.821, + "grad_norm": 3.392768144607544, + "kl": 26.46875, + "learning_rate": 1.8995835955420417e-06, + "loss": 2.2695, + "num_tokens": 47378846.0, + "reward": 1.58203125, + "reward_std": 0.7990159690380096, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.3884936347603798, + "step": 1642, + "token_counts/after_target": 787.0, + "token_counts/after_think": 213.0, + "token_counts/before_target": 1931.75, + "token_counts/before_think": 722.25 + }, + { + "avg_penalty/after_target": 2.6372052431106567, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.37677332386374474, + "avg_penalty/before_think": 0.542002446949482, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 
726.0, + "completions/max_terminated_length": 707.5, + "completions/mean_length": 207.84375, + "completions/mean_terminated_length": 195.44166946411133, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.8215, + "grad_norm": 6.90211820602417, + "kl": 22.8125, + "learning_rate": 1.8893618101067357e-06, + "loss": 2.1114, + "num_tokens": 47403028.0, + "reward": 1.66015625, + "reward_std": 0.7274613827466965, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.39123913645744324, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.34717122465372086, + "step": 1643, + "token_counts/after_target": 682.0, + "token_counts/after_think": 45.75, + "token_counts/before_target": 1668.5, + "token_counts/before_think": 929.25 + }, + { + "avg_penalty/after_target": 3.2677259743213654, + "avg_penalty/after_think": 1.6677085161209106, + "avg_penalty/before_target": 0.1861085742712021, + "avg_penalty/before_think": 0.3081599958240986, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.5, + "completions/max_terminated_length": 407.5, + "completions/mean_length": 136.03125, + "completions/mean_terminated_length": 136.03125, + "completions/min_length": 41.5, + "completions/min_terminated_length": 41.5, + "epoch": 0.822, + "grad_norm": 7.920285701751709, + "kl": 29.15625, + "learning_rate": 1.8791647310819371e-06, + "loss": 2.2152, + "num_tokens": 47422454.0, + "reward": 1.4375, + "reward_std": 0.9053324908018112, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.47083858400583267, + "rewards/tag_count_reward/mean": 0.734375, + 
"rewards/tag_count_reward/std": 0.4477841481566429, + "step": 1644, + "token_counts/after_target": 340.5, + "token_counts/after_think": 6.0, + "token_counts/before_target": 1282.5, + "token_counts/before_think": 547.5 + }, + { + "avg_penalty/after_target": 2.454155534505844, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.36797547712922096, + "avg_penalty/before_think": 0.4425608888268471, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 685.0, + "completions/max_terminated_length": 600.75, + "completions/mean_length": 227.65625, + "completions/mean_terminated_length": 215.07917022705078, + "completions/min_length": 63.5, + "completions/min_terminated_length": 63.5, + "epoch": 0.8225, + "grad_norm": 5.690909385681152, + "kl": 20.9375, + "learning_rate": 1.8689923895297247e-06, + "loss": 1.9687, + "num_tokens": 47445456.0, + "reward": 1.765625, + "reward_std": 0.8556950241327286, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.12909944355487823, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.40311288833618164, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.3719746544957161, + "step": 1645, + "token_counts/after_target": 681.0, + "token_counts/after_think": 81.5, + "token_counts/before_target": 1878.75, + "token_counts/before_think": 1001.25 + }, + { + "avg_penalty/after_target": 2.8253875374794006, + "avg_penalty/after_think": 1.8130862712860107, + "avg_penalty/before_target": 0.3788605220615864, + "avg_penalty/before_think": 0.4133627861738205, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 600.5, + 
"completions/max_terminated_length": 544.5, + "completions/mean_length": 175.6875, + "completions/mean_terminated_length": 163.6510467529297, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.823, + "grad_norm": 3.8722081184387207, + "kl": 27.59375, + "learning_rate": 1.858844816436809e-06, + "loss": 2.401, + "num_tokens": 47466124.0, + "reward": 1.51171875, + "reward_std": 0.8552558124065399, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4383598491549492, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.41954080015420914, + "step": 1646, + "token_counts/after_target": 731.0, + "token_counts/after_think": 13.75, + "token_counts/before_target": 1268.0, + "token_counts/before_think": 798.25 + }, + { + "avg_penalty/after_target": 3.2459834814071655, + "avg_penalty/after_think": 2.6625680923461914, + "avg_penalty/before_target": 0.326738677918911, + "avg_penalty/before_think": 0.3334622420370579, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.25, + "completions/max_terminated_length": 579.25, + "completions/mean_length": 188.828125, + "completions/mean_terminated_length": 188.828125, + "completions/min_length": 44.5, + "completions/min_terminated_length": 44.5, + "epoch": 0.8235, + "grad_norm": 7.3043622970581055, + "kl": 31.0625, + "learning_rate": 1.848722042714457e-06, + "loss": 2.4127, + "num_tokens": 47487553.0, + "reward": 1.4453125, + "reward_std": 0.8778593689203262, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4682852029800415, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 
0.42421380430459976, + "step": 1647, + "token_counts/after_target": 560.25, + "token_counts/after_think": 41.75, + "token_counts/before_target": 1808.75, + "token_counts/before_think": 610.5 + }, + { + "avg_penalty/after_target": 2.1191426217556, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4596252217888832, + "avg_penalty/before_think": 0.4359416663646698, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 197.28125, + "completions/mean_terminated_length": 197.28125, + "completions/min_length": 42.75, + "completions/min_terminated_length": 42.75, + "epoch": 0.824, + "grad_norm": 3.3577160835266113, + "kl": 22.65625, + "learning_rate": 1.8386240991983973e-06, + "loss": 1.9033, + "num_tokens": 47514563.0, + "reward": 1.6171875, + "reward_std": 0.8857776373624802, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.11180340498685837, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42867646366357803, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.40493375808000565, + "step": 1648, + "token_counts/after_target": 676.5, + "token_counts/after_think": 33.75, + "token_counts/before_target": 1711.0, + "token_counts/before_think": 735.25 + }, + { + "avg_penalty/after_target": 2.56948259472847, + "avg_penalty/after_think": 2.8317238688468933, + "avg_penalty/before_target": 0.3417443484067917, + "avg_penalty/before_think": 0.32607462257146835, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.75, + "completions/max_terminated_length": 449.75, + 
"completions/mean_length": 133.734375, + "completions/mean_terminated_length": 133.734375, + "completions/min_length": 39.75, + "completions/min_terminated_length": 39.75, + "epoch": 0.8245, + "grad_norm": 3.1251747608184814, + "kl": 22.953125, + "learning_rate": 1.8285510166487154e-06, + "loss": 1.8988, + "num_tokens": 47533330.0, + "reward": 1.640625, + "reward_std": 0.7457688003778458, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.38688503205776215, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.3620058596134186, + "step": 1649, + "token_counts/after_target": 351.75, + "token_counts/after_think": 30.75, + "token_counts/before_target": 1196.25, + "token_counts/before_think": 561.0 + }, + { + "avg_penalty/after_target": 2.4985541999340057, + "avg_penalty/after_think": 2.8519392609596252, + "avg_penalty/before_target": 0.3872176222503185, + "avg_penalty/before_think": 0.4281659796833992, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 767.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 166.9375, + "completions/mean_terminated_length": 154.02187728881836, + "completions/min_length": 41.25, + "completions/min_terminated_length": 41.25, + "epoch": 0.825, + "grad_norm": 12.535480499267578, + "kl": 26.4375, + "learning_rate": 1.818502825749764e-06, + "loss": 2.6552, + "num_tokens": 47553998.0, + "reward": 1.734375, + "reward_std": 0.6700502783060074, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3450859263539314, + "rewards/tag_count_reward/mean": 0.875, + "rewards/tag_count_reward/std": 0.33406074345111847, + "step": 1650, + 
"token_counts/after_target": 523.5, + "token_counts/after_think": 78.0, + "token_counts/before_target": 1428.75, + "token_counts/before_think": 640.75 + }, + { + "avg_penalty/after_target": 3.2246845960617065, + "avg_penalty/after_think": 2.905808389186859, + "avg_penalty/before_target": 0.31032101064920425, + "avg_penalty/before_think": 0.5786322057247162, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.25, + "completions/max_terminated_length": 516.25, + "completions/mean_length": 165.21875, + "completions/mean_terminated_length": 165.21875, + "completions/min_length": 54.5, + "completions/min_terminated_length": 54.5, + "epoch": 0.8255, + "grad_norm": 13.258946418762207, + "kl": 15.0625, + "learning_rate": 1.808479557110081e-06, + "loss": 1.7238, + "num_tokens": 47575004.0, + "reward": 1.671875, + "reward_std": 0.6903320848941803, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3837348371744156, + "rewards/tag_count_reward/mean": 0.859375, + "rewards/tag_count_reward/std": 0.31941793113946915, + "step": 1651, + "token_counts/after_target": 480.25, + "token_counts/after_think": 140.75, + "token_counts/before_target": 1256.5, + "token_counts/before_think": 766.0 + }, + { + "avg_penalty/after_target": 2.8439339995384216, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.519237220287323, + "avg_penalty/before_think": 0.3355007730424404, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 843.25, + "completions/max_terminated_length": 843.25, + "completions/mean_length": 261.015625, + 
"completions/mean_terminated_length": 261.015625, + "completions/min_length": 53.5, + "completions/min_terminated_length": 53.5, + "epoch": 0.826, + "grad_norm": 6.965649127960205, + "kl": 26.71875, + "learning_rate": 1.7984812412622787e-06, + "loss": 2.4278, + "num_tokens": 47600845.0, + "reward": 1.5390625, + "reward_std": 0.8248428702354431, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.3982383906841278, + "step": 1652, + "token_counts/after_target": 1116.0, + "token_counts/after_think": 92.5, + "token_counts/before_target": 2109.25, + "token_counts/before_think": 858.5 + }, + { + "avg_penalty/after_target": 3.2055450677871704, + "avg_penalty/after_think": 2.698945462703705, + "avg_penalty/before_target": 0.36900854855775833, + "avg_penalty/before_think": 0.43182215094566345, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.5, + "completions/max_terminated_length": 604.5, + "completions/mean_length": 167.78125, + "completions/mean_terminated_length": 167.78125, + "completions/min_length": 39.5, + "completions/min_terminated_length": 39.5, + "epoch": 0.8265, + "grad_norm": 3.5789153575897217, + "kl": 28.84375, + "learning_rate": 1.7885079086629598e-06, + "loss": 2.4923, + "num_tokens": 47621871.0, + "reward": 1.4921875, + "reward_std": 0.8741040080785751, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.45508860796689987, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.42485617846250534, + "step": 1653, + "token_counts/after_target": 744.75, + 
"token_counts/after_think": 29.25, + "token_counts/before_target": 1472.0, + "token_counts/before_think": 438.5 + }, + { + "avg_penalty/after_target": 2.5132459700107574, + "avg_penalty/after_think": 3.5611424446105957, + "avg_penalty/before_target": 0.3620757535099983, + "avg_penalty/before_think": 0.5145538374781609, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.25, + "completions/max_terminated_length": 443.25, + "completions/mean_length": 145.734375, + "completions/mean_terminated_length": 145.734375, + "completions/min_length": 40.25, + "completions/min_terminated_length": 40.25, + "epoch": 0.827, + "grad_norm": 4.745544910430908, + "kl": 20.921875, + "learning_rate": 1.7785595896926267e-06, + "loss": 1.7037, + "num_tokens": 47638894.0, + "reward": 1.5625, + "reward_std": 0.7793950140476227, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.41110680997371674, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.38063329458236694, + "step": 1654, + "token_counts/after_target": 353.75, + "token_counts/after_think": 57.25, + "token_counts/before_target": 1304.25, + "token_counts/before_think": 616.5 + }, + { + "avg_penalty/after_target": 1.8500245809555054, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.35498809069395065, + "avg_penalty/before_think": 0.42333008348941803, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.0, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 180.734375, + "completions/mean_terminated_length": 180.734375, + 
"completions/min_length": 31.5, + "completions/min_terminated_length": 31.5, + "epoch": 0.8275, + "grad_norm": 13.060548782348633, + "kl": 26.0, + "learning_rate": 1.7686363146555807e-06, + "loss": 1.8168, + "num_tokens": 47660957.0, + "reward": 1.48046875, + "reward_std": 0.8742199838161469, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4519384130835533, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.4289069399237633, + "step": 1655, + "token_counts/after_target": 334.5, + "token_counts/after_think": 17.5, + "token_counts/before_target": 1859.25, + "token_counts/before_think": 680.5 + }, + { + "avg_penalty/after_target": 2.316864460706711, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4923650696873665, + "avg_penalty/before_think": 0.4016845151782036, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 683.0, + "completions/max_terminated_length": 538.25, + "completions/mean_length": 202.671875, + "completions/mean_terminated_length": 189.2427101135254, + "completions/min_length": 43.75, + "completions/min_terminated_length": 43.75, + "epoch": 0.828, + "grad_norm": 5.526986122131348, + "kl": 25.15625, + "learning_rate": 1.7587381137798432e-06, + "loss": 2.3325, + "num_tokens": 47684184.0, + "reward": 1.66015625, + "reward_std": 0.7358910888433456, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.3491506353020668, + "step": 1656, + "token_counts/after_target": 737.5, + "token_counts/after_think": 60.25, + "token_counts/before_target": 
1780.25, + "token_counts/before_think": 664.75 + }, + { + "avg_penalty/after_target": 2.2536925971508026, + "avg_penalty/after_think": 3.7481738328933716, + "avg_penalty/before_target": 0.3866269327700138, + "avg_penalty/before_think": 0.421258881688118, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.25, + "completions/max_terminated_length": 647.25, + "completions/mean_length": 182.125, + "completions/mean_terminated_length": 182.125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.8285, + "grad_norm": 10.864778518676758, + "kl": 29.609375, + "learning_rate": 1.7488650172170496e-06, + "loss": 2.1895, + "num_tokens": 47705232.0, + "reward": 1.46484375, + "reward_std": 0.853804275393486, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.44495995342731476, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.41612908244132996, + "step": 1657, + "token_counts/after_target": 523.0, + "token_counts/after_think": 54.5, + "token_counts/before_target": 1781.25, + "token_counts/before_think": 555.25 + }, + { + "avg_penalty/after_target": 2.2104072272777557, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.5231677666306496, + "avg_penalty/before_think": 0.3418773263692856, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 704.75, + "completions/max_terminated_length": 693.5, + "completions/mean_length": 184.546875, + "completions/mean_terminated_length": 171.5500030517578, + "completions/min_length": 44.0, + "completions/min_terminated_length": 
44.0, + "epoch": 0.829, + "grad_norm": 4.171438217163086, + "kl": 29.0625, + "learning_rate": 1.7390170550423624e-06, + "loss": 2.4226, + "num_tokens": 47729683.0, + "reward": 1.5390625, + "reward_std": 0.8725423067808151, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.41784074157476425, + "step": 1658, + "token_counts/after_target": 640.75, + "token_counts/after_think": 43.5, + "token_counts/before_target": 1531.25, + "token_counts/before_think": 737.25 + }, + { + "avg_penalty/after_target": 1.6778708398342133, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.38974185660481453, + "avg_penalty/before_think": 0.3485250473022461, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.75, + "completions/max_terminated_length": 514.75, + "completions/mean_length": 160.828125, + "completions/mean_terminated_length": 160.828125, + "completions/min_length": 33.75, + "completions/min_terminated_length": 33.75, + "epoch": 0.8295, + "grad_norm": 4.862649440765381, + "kl": 23.171875, + "learning_rate": 1.7291942572543806e-06, + "loss": 1.8859, + "num_tokens": 47748120.0, + "reward": 1.5625, + "reward_std": 0.7691433429718018, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4308478757739067, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.3636437729001045, + "step": 1659, + "token_counts/after_target": 291.25, + "token_counts/after_think": 41.5, + "token_counts/before_target": 1589.5, + "token_counts/before_think": 651.0 + }, + { + "avg_penalty/after_target": 
2.2796087861061096, + "avg_penalty/after_think": 3.649962842464447, + "avg_penalty/before_target": 0.35430749505758286, + "avg_penalty/before_think": 0.5374706834554672, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.25, + "completions/max_terminated_length": 649.25, + "completions/mean_length": 193.6875, + "completions/mean_terminated_length": 193.6875, + "completions/min_length": 44.5, + "completions/min_terminated_length": 44.5, + "epoch": 0.83, + "grad_norm": 7.3134026527404785, + "kl": 19.171875, + "learning_rate": 1.7193966537750561e-06, + "loss": 1.9041, + "num_tokens": 47769428.0, + "reward": 1.73046875, + "reward_std": 0.6049162447452545, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.30233466625213623, + "rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.31229958683252335, + "step": 1660, + "token_counts/after_target": 410.75, + "token_counts/after_think": 101.75, + "token_counts/before_target": 1826.0, + "token_counts/before_think": 760.5 + }, + { + "avg_penalty/after_target": 2.7643285393714905, + "avg_penalty/after_think": 2.7181060314178467, + "avg_penalty/before_target": 0.36471424624323845, + "avg_penalty/before_think": 0.3351431041955948, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.75, + "completions/max_terminated_length": 606.75, + "completions/mean_length": 190.625, + "completions/mean_terminated_length": 190.625, + "completions/min_length": 46.25, + "completions/min_terminated_length": 46.25, + "epoch": 0.8305, + "grad_norm": 5.065322399139404, + "kl": 25.515625, + 
"learning_rate": 1.709624274449584e-06, + "loss": 2.0918, + "num_tokens": 47792332.0, + "reward": 1.5625, + "reward_std": 0.7721997201442719, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4079566150903702, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.3702521026134491, + "step": 1661, + "token_counts/after_target": 572.75, + "token_counts/after_think": 19.25, + "token_counts/before_target": 1847.75, + "token_counts/before_think": 610.25 + }, + { + "avg_penalty/after_target": 2.291781336069107, + "avg_penalty/after_think": 2.6491145491600037, + "avg_penalty/before_target": 0.3260641470551491, + "avg_penalty/before_think": 0.43572432547807693, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 570.5, + "completions/max_terminated_length": 458.25, + "completions/mean_length": 168.5625, + "completions/mean_terminated_length": 155.1895866394043, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.831, + "grad_norm": 2.7826039791107178, + "kl": 15.28515625, + "learning_rate": 1.6998771490463262e-06, + "loss": 1.4654, + "num_tokens": 47812672.0, + "reward": 1.7578125, + "reward_std": 0.5255059748888016, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.2979728877544403, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.24322887510061264, + "step": 1662, + "token_counts/after_target": 401.0, + "token_counts/after_think": 23.0, + "token_counts/before_target": 1265.0, + "token_counts/before_think": 1008.0 + }, + { + "avg_penalty/after_target": 2.6460792422294617, + "avg_penalty/after_think": 4.0, + 
"avg_penalty/before_target": 0.3430255725979805, + "avg_penalty/before_think": 0.39836038649082184, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.75, + "completions/max_terminated_length": 607.75, + "completions/mean_length": 187.8125, + "completions/mean_terminated_length": 187.8125, + "completions/min_length": 44.5, + "completions/min_terminated_length": 44.5, + "epoch": 0.8315, + "grad_norm": 2.199462652206421, + "kl": 16.708984375, + "learning_rate": 1.6901553072567189e-06, + "loss": 1.5437, + "num_tokens": 47833924.0, + "reward": 1.84375, + "reward_std": 0.5157571136951447, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.11967839300632477, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.2750816270709038, + "rewards/tag_count_reward/mean": 0.890625, + "rewards/tag_count_reward/std": 0.2302793487906456, + "step": 1663, + "token_counts/after_target": 430.0, + "token_counts/after_think": 156.5, + "token_counts/before_target": 1705.5, + "token_counts/before_think": 713.0 + }, + { + "avg_penalty/after_target": 2.3266715109348297, + "avg_penalty/after_think": 3.8538554310798645, + "avg_penalty/before_target": 0.42623286321759224, + "avg_penalty/before_think": 0.3352869153022766, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 144.90625, + "completions/mean_terminated_length": 144.90625, + "completions/min_length": 31.5, + "completions/min_terminated_length": 31.5, + "epoch": 0.832, + "grad_norm": 3.643171787261963, + "kl": 23.8125, + "learning_rate": 1.6804587786951744e-06, + "loss": 1.9867, + 
"num_tokens": 47853358.0, + "reward": 1.58984375, + "reward_std": 0.7875443249940872, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.3806067779660225, + "step": 1664, + "token_counts/after_target": 411.75, + "token_counts/after_think": 32.75, + "token_counts/before_target": 1295.5, + "token_counts/before_think": 578.5 + }, + { + "avg_penalty/after_target": 3.1873629689216614, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3336855359375477, + "avg_penalty/before_think": 0.4808916300535202, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 182.875, + "completions/mean_terminated_length": 182.875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.8325, + "grad_norm": 8.327071189880371, + "kl": 16.91015625, + "learning_rate": 1.6707875928990059e-06, + "loss": 1.7512, + "num_tokens": 47873718.0, + "reward": 1.65625, + "reward_std": 0.6986647546291351, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.38688503205776215, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.32392416149377823, + "step": 1665, + "token_counts/after_target": 653.75, + "token_counts/after_think": 30.75, + "token_counts/before_target": 1540.5, + "token_counts/before_think": 701.0 + }, + { + "avg_penalty/after_target": 2.3045083582401276, + "avg_penalty/after_think": 3.826586425304413, + "avg_penalty/before_target": 0.2774486541748047, + "avg_penalty/before_think": 
0.43114980310201645, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.5, + "completions/max_terminated_length": 366.5, + "completions/mean_length": 138.203125, + "completions/mean_terminated_length": 138.203125, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.833, + "grad_norm": 4.499702453613281, + "kl": 10.7421875, + "learning_rate": 1.6611417793283192e-06, + "loss": 1.0539, + "num_tokens": 47891011.0, + "reward": 1.71875, + "reward_std": 0.6936421692371368, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.3723389655351639, + "rewards/tag_count_reward/mean": 0.875, + "rewards/tag_count_reward/std": 0.3415650427341461, + "step": 1666, + "token_counts/after_target": 279.75, + "token_counts/after_think": 17.5, + "token_counts/before_target": 864.5, + "token_counts/before_think": 1049.5 + }, + { + "avg_penalty/after_target": 2.7682949602603912, + "avg_penalty/after_think": 1.69840008020401, + "avg_penalty/before_target": 0.30197740718722343, + "avg_penalty/before_think": 0.3246425464749336, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.5, + "completions/max_terminated_length": 367.5, + "completions/mean_length": 125.703125, + "completions/mean_terminated_length": 125.703125, + "completions/min_length": 48.25, + "completions/min_terminated_length": 48.25, + "epoch": 0.8335, + "grad_norm": 4.488097190856934, + "kl": 11.421875, + "learning_rate": 1.651521367365936e-06, + "loss": 1.0695, + "num_tokens": 47908208.0, + "reward": 1.76953125, + "reward_std": 0.6568455100059509, + 
"rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.356952004134655, + "rewards/tag_count_reward/mean": 0.89453125, + "rewards/tag_count_reward/std": 0.2991661876440048, + "step": 1667, + "token_counts/after_target": 241.25, + "token_counts/after_think": 17.75, + "token_counts/before_target": 822.0, + "token_counts/before_think": 930.25 + }, + { + "avg_penalty/after_target": 2.137868344783783, + "avg_penalty/after_think": 3.490777373313904, + "avg_penalty/before_target": 0.31181248649954796, + "avg_penalty/before_think": 0.45826756209135056, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.75, + "completions/max_terminated_length": 562.75, + "completions/mean_length": 173.71875, + "completions/mean_terminated_length": 173.71875, + "completions/min_length": 32.5, + "completions/min_terminated_length": 32.5, + "epoch": 0.834, + "grad_norm": 7.847002029418945, + "kl": 16.6484375, + "learning_rate": 1.6419263863172997e-06, + "loss": 1.2115, + "num_tokens": 47929198.0, + "reward": 1.63671875, + "reward_std": 0.7381281554698944, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.34477680176496506, + "step": 1668, + "token_counts/after_target": 204.75, + "token_counts/after_think": 80.0, + "token_counts/before_target": 1347.5, + "token_counts/before_think": 1147.25 + }, + { + "avg_penalty/after_target": 2.9574612379074097, + "avg_penalty/after_think": 1.7181621193885803, + "avg_penalty/before_target": 0.23195134475827217, + "avg_penalty/before_think": 0.40715817362070084, + "clip_ratio/high_max": 0.0, 
+ "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.5, + "completions/max_terminated_length": 451.5, + "completions/mean_length": 156.296875, + "completions/mean_terminated_length": 156.296875, + "completions/min_length": 40.5, + "completions/min_terminated_length": 40.5, + "epoch": 0.8345, + "grad_norm": 4.244269847869873, + "kl": 18.23046875, + "learning_rate": 1.6323568654103838e-06, + "loss": 1.5265, + "num_tokens": 47949377.0, + "reward": 1.61328125, + "reward_std": 0.819975420832634, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.14789126068353653, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4079566150903702, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.37117981165647507, + "step": 1669, + "token_counts/after_target": 474.0, + "token_counts/after_think": 43.0, + "token_counts/before_target": 1270.75, + "token_counts/before_think": 713.0 + }, + { + "avg_penalty/after_target": 2.3539429903030396, + "avg_penalty/after_think": 3.169430911540985, + "avg_penalty/before_target": 0.434185978025198, + "avg_penalty/before_think": 0.31031276658177376, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 643.25, + "completions/max_terminated_length": 479.75, + "completions/mean_length": 158.625, + "completions/mean_terminated_length": 144.29687690734863, + "completions/min_length": 33.25, + "completions/min_terminated_length": 33.25, + "epoch": 0.835, + "grad_norm": 4.185312747955322, + "kl": 25.25, + "learning_rate": 1.6228128337956128e-06, + "loss": 2.2585, + "num_tokens": 47968313.0, + "reward": 1.62109375, + "reward_std": 0.7431870847940445, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.37366948276758194, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.3778417482972145, + "step": 1670, + "token_counts/after_target": 479.0, + "token_counts/after_think": 22.25, + "token_counts/before_target": 1424.25, + "token_counts/before_think": 612.5 + }, + { + "avg_penalty/after_target": 2.6037160754203796, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.38605744391679764, + "avg_penalty/before_think": 0.45618586987257004, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.25, + "completions/max_terminated_length": 500.25, + "completions/mean_length": 207.03125, + "completions/mean_terminated_length": 207.03125, + "completions/min_length": 50.25, + "completions/min_terminated_length": 50.25, + "epoch": 0.8355, + "grad_norm": 6.139948844909668, + "kl": 25.0, + "learning_rate": 1.6132943205457607e-06, + "loss": 2.064, + "num_tokens": 47992555.0, + "reward": 1.5390625, + "reward_std": 0.8006330877542496, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4308478757739067, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.38524511456489563, + "step": 1671, + "token_counts/after_target": 687.5, + "token_counts/after_think": 53.5, + "token_counts/before_target": 1716.5, + "token_counts/before_think": 855.0 + }, + { + "avg_penalty/after_target": 2.3234779834747314, + "avg_penalty/after_think": 3.3904081284999847, + "avg_penalty/before_target": 0.23306728526949883, + "avg_penalty/before_think": 0.5053859800100327, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 154.609375, + "completions/mean_terminated_length": 154.609375, + "completions/min_length": 36.75, + "completions/min_terminated_length": 36.75, + "epoch": 0.836, + "grad_norm": 6.8273091316223145, + "kl": 18.3359375, + "learning_rate": 1.6038013546558695e-06, + "loss": 1.3917, + "num_tokens": 48012146.0, + "reward": 1.61328125, + "reward_std": 0.7484257221221924, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.3846946656703949, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.3696632981300354, + "step": 1672, + "token_counts/after_target": 229.25, + "token_counts/after_think": 64.0, + "token_counts/before_target": 1621.5, + "token_counts/before_think": 559.0 + }, + { + "avg_penalty/after_target": 2.107176512479782, + "avg_penalty/after_think": 3.852283298969269, + "avg_penalty/before_target": 0.5307598859071732, + "avg_penalty/before_think": 0.46849966794252396, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.25, + "completions/max_terminated_length": 642.25, + "completions/mean_length": 159.859375, + "completions/mean_terminated_length": 159.859375, + "completions/min_length": 42.25, + "completions/min_terminated_length": 42.25, + "epoch": 0.8365, + "grad_norm": 10.973267555236816, + "kl": 18.640625, + "learning_rate": 1.5943339650431578e-06, + "loss": 2.0358, + "num_tokens": 48031337.0, + "reward": 1.65625, + "reward_std": 0.7339277565479279, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.3529142737388611, + "step": 1673, + "token_counts/after_target": 535.0, + "token_counts/after_think": 111.75, + "token_counts/before_target": 1264.25, + "token_counts/before_think": 646.75 + }, + { + "avg_penalty/after_target": 2.1339240670204163, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4092235565185547, + "avg_penalty/before_think": 0.3734358847141266, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.25, + "completions/max_terminated_length": 541.25, + "completions/mean_length": 174.984375, + "completions/mean_terminated_length": 174.984375, + "completions/min_length": 43.5, + "completions/min_terminated_length": 43.5, + "epoch": 0.837, + "grad_norm": 3.6371278762817383, + "kl": 20.65625, + "learning_rate": 1.5848921805469396e-06, + "loss": 1.744, + "num_tokens": 48052552.0, + "reward": 1.6171875, + "reward_std": 0.7266508042812347, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.41194770485162735, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.3325646445155144, + "step": 1674, + "token_counts/after_target": 324.75, + "token_counts/after_think": 38.75, + "token_counts/before_target": 1463.0, + "token_counts/before_think": 973.25 + }, + { + "avg_penalty/after_target": 2.2500410079956055, + "avg_penalty/after_think": 3.83624529838562, + "avg_penalty/before_target": 0.4055761806666851, + "avg_penalty/before_think": 0.4897492080926895, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 556.5, + "completions/max_terminated_length": 416.75, + "completions/mean_length": 174.953125, + "completions/mean_terminated_length": 161.68541717529297, + "completions/min_length": 39.75, + "completions/min_terminated_length": 39.75, + "epoch": 0.8375, + "grad_norm": 10.06994342803955, + "kl": 11.5751953125, + "learning_rate": 1.5754760299285255e-06, + "loss": 1.3757, + "num_tokens": 48074245.0, + "reward": 1.80859375, + "reward_std": 0.4822036325931549, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.2596946656703949, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.20559876412153244, + "step": 1675, + "token_counts/after_target": 446.75, + "token_counts/after_think": 200.75, + "token_counts/before_target": 1325.0, + "token_counts/before_think": 826.75 + }, + { + "avg_penalty/after_target": 2.4674381613731384, + "avg_penalty/after_think": 3.93447345495224, + "avg_penalty/before_target": 0.3983985558152199, + "avg_penalty/before_think": 0.7806489244103432, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.25, + "completions/max_terminated_length": 656.25, + "completions/mean_length": 167.28125, + "completions/mean_terminated_length": 167.28125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.838, + "grad_norm": 5.953053951263428, + "kl": 23.375, + "learning_rate": 1.566085541871145e-06, + "loss": 1.9083, + "num_tokens": 48097351.0, + "reward": 1.5390625, + "reward_std": 0.8280062973499298, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + 
"rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.39972560852766037, + "step": 1676, + "token_counts/after_target": 469.25, + "token_counts/after_think": 42.0, + "token_counts/before_target": 1412.5, + "token_counts/before_think": 752.75 + }, + { + "avg_penalty/after_target": 2.4171994626522064, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3860083259642124, + "avg_penalty/before_think": 0.3784985691308975, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 584.5, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 171.5, + "completions/mean_terminated_length": 157.78854370117188, + "completions/min_length": 60.25, + "completions/min_terminated_length": 60.25, + "epoch": 0.8385, + "grad_norm": 3.181589365005493, + "kl": 19.21875, + "learning_rate": 1.5567207449798517e-06, + "loss": 1.7117, + "num_tokens": 48117623.0, + "reward": 1.66015625, + "reward_std": 0.687163770198822, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.37366948276758194, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.329166978597641, + "step": 1677, + "token_counts/after_target": 403.5, + "token_counts/after_think": 51.5, + "token_counts/before_target": 1584.75, + "token_counts/before_think": 704.25 + }, + { + "avg_penalty/after_target": 2.2339622378349304, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.39427559077739716, + "avg_penalty/before_think": 0.4879963621497154, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 528.5, + "completions/max_terminated_length": 528.5, + "completions/mean_length": 171.65625, + "completions/mean_terminated_length": 171.65625, + "completions/min_length": 43.5, + "completions/min_terminated_length": 43.5, + "epoch": 0.839, + "grad_norm": 2.8596088886260986, + "kl": 21.3125, + "learning_rate": 1.547381667781439e-06, + "loss": 1.8863, + "num_tokens": 48138273.0, + "reward": 1.69140625, + "reward_std": 0.6732946932315826, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.37149807065725327, + "rewards/tag_count_reward/mean": 0.86328125, + "rewards/tag_count_reward/std": 0.30873048305511475, + "step": 1678, + "token_counts/after_target": 407.0, + "token_counts/after_think": 41.5, + "token_counts/before_target": 1339.0, + "token_counts/before_think": 959.0 + }, + { + "avg_penalty/after_target": 2.085746556520462, + "avg_penalty/after_think": 3.684457838535309, + "avg_penalty/before_target": 0.35748642683029175, + "avg_penalty/before_think": 0.4045524224638939, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.75, + "completions/max_terminated_length": 686.75, + "completions/mean_length": 197.984375, + "completions/mean_terminated_length": 197.984375, + "completions/min_length": 45.25, + "completions/min_terminated_length": 45.25, + "epoch": 0.8395, + "grad_norm": 3.1927127838134766, + "kl": 19.5625, + "learning_rate": 1.538068338724361e-06, + "loss": 1.699, + "num_tokens": 48159392.0, + "reward": 1.58984375, + "reward_std": 0.7252746820449829, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4154609143733978, + 
"rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.32885555177927017, + "step": 1679, + "token_counts/after_target": 477.5, + "token_counts/after_think": 102.5, + "token_counts/before_target": 1661.0, + "token_counts/before_think": 926.75 + }, + { + "avg_penalty/after_target": 2.320022463798523, + "avg_penalty/after_think": 3.962501287460327, + "avg_penalty/before_target": 0.5647250860929489, + "avg_penalty/before_think": 0.36365417391061783, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.25, + "completions/max_terminated_length": 561.25, + "completions/mean_length": 189.859375, + "completions/mean_terminated_length": 189.859375, + "completions/min_length": 36.75, + "completions/min_terminated_length": 36.75, + "epoch": 0.84, + "grad_norm": 5.2679362297058105, + "kl": 25.3125, + "learning_rate": 1.5287807861786308e-06, + "loss": 2.0702, + "num_tokens": 48182247.0, + "reward": 1.546875, + "reward_std": 0.8009445518255234, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4154609143733978, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.3890140950679779, + "step": 1680, + "token_counts/after_target": 571.25, + "token_counts/after_think": 73.75, + "token_counts/before_target": 1366.25, + "token_counts/before_think": 1026.5 + }, + { + "avg_penalty/after_target": 3.314361870288849, + "avg_penalty/after_think": 1.6719837188720703, + "avg_penalty/before_target": 0.2946103997528553, + "avg_penalty/before_think": 0.3892161287367344, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + 
"completions/max_length": 499.75, + "completions/max_terminated_length": 499.75, + "completions/mean_length": 174.359375, + "completions/mean_terminated_length": 174.359375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.8405, + "grad_norm": 3.6529459953308105, + "kl": 22.6875, + "learning_rate": 1.5195190384357405e-06, + "loss": 1.9092, + "num_tokens": 48209486.0, + "reward": 1.4921875, + "reward_std": 0.8425682336091995, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4598134011030197, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.39718345552682877, + "step": 1681, + "token_counts/after_target": 580.5, + "token_counts/after_think": 45.75, + "token_counts/before_target": 1363.75, + "token_counts/before_think": 799.75 + }, + { + "avg_penalty/after_target": 2.72415691614151, + "avg_penalty/after_think": 3.5891664028167725, + "avg_penalty/before_target": 0.3062274605035782, + "avg_penalty/before_think": 0.36717547476291656, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 101.046875, + "completions/mean_terminated_length": 101.046875, + "completions/min_length": 37.75, + "completions/min_terminated_length": 37.75, + "epoch": 0.841, + "grad_norm": 4.51593542098999, + "kl": 15.52001953125, + "learning_rate": 1.5102831237085857e-06, + "loss": 1.3479, + "num_tokens": 48225297.0, + "reward": 1.7578125, + "reward_std": 0.5529232025146484, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.28610680997371674, + 
"rewards/tag_count_reward/mean": 0.8828125, + "rewards/tag_count_reward/std": 0.2557990700006485, + "step": 1682, + "token_counts/after_target": 196.0, + "token_counts/after_think": 27.0, + "token_counts/before_target": 885.25, + "token_counts/before_think": 508.5 + }, + { + "avg_penalty/after_target": 2.5494864881038666, + "avg_penalty/after_think": 3.993310868740082, + "avg_penalty/before_target": 0.4583541750907898, + "avg_penalty/before_think": 0.4959178790450096, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 731.0, + "completions/max_terminated_length": 642.25, + "completions/mean_length": 211.953125, + "completions/mean_terminated_length": 199.30729293823242, + "completions/min_length": 34.75, + "completions/min_terminated_length": 34.75, + "epoch": 0.8415, + "grad_norm": 6.607772350311279, + "kl": 22.0625, + "learning_rate": 1.5010730701313626e-06, + "loss": 2.0557, + "num_tokens": 48248094.0, + "reward": 1.578125, + "reward_std": 0.8748218417167664, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.11967839300632477, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4471946656703949, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.37949711829423904, + "step": 1683, + "token_counts/after_target": 808.5, + "token_counts/after_think": 102.5, + "token_counts/before_target": 1808.75, + "token_counts/before_think": 671.5 + }, + { + "avg_penalty/after_target": 2.955225110054016, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.41177619248628616, + "avg_penalty/before_think": 0.3537176698446274, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + 
"completions/max_length": 509.5, + "completions/max_terminated_length": 509.5, + "completions/mean_length": 178.96875, + "completions/mean_terminated_length": 178.96875, + "completions/min_length": 46.75, + "completions/min_terminated_length": 46.75, + "epoch": 0.842, + "grad_norm": 6.720673561096191, + "kl": 22.078125, + "learning_rate": 1.4918889057594876e-06, + "loss": 2.0463, + "num_tokens": 48268124.0, + "reward": 1.59765625, + "reward_std": 0.7861086428165436, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42733466625213623, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.3669648915529251, + "step": 1684, + "token_counts/after_target": 689.5, + "token_counts/after_think": 3.5, + "token_counts/before_target": 1112.75, + "token_counts/before_think": 1057.75 + }, + { + "avg_penalty/after_target": 2.2221015989780426, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.36617784947156906, + "avg_penalty/before_think": 0.5785855576395988, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.5, + "completions/max_terminated_length": 526.5, + "completions/mean_length": 159.71875, + "completions/mean_terminated_length": 159.71875, + "completions/min_length": 42.75, + "completions/min_terminated_length": 42.75, + "epoch": 0.8425, + "grad_norm": 5.224572658538818, + "kl": 21.875, + "learning_rate": 1.4827306585695234e-06, + "loss": 1.7873, + "num_tokens": 48287802.0, + "reward": 1.55859375, + "reward_std": 0.8039072304964066, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4229728877544403, + "rewards/tag_count_reward/mean": 0.79296875, + 
"rewards/tag_count_reward/std": 0.3911045789718628, + "step": 1685, + "token_counts/after_target": 336.5, + "token_counts/after_think": 148.75, + "token_counts/before_target": 1324.75, + "token_counts/before_think": 745.5 + }, + { + "avg_penalty/after_target": 2.0116962790489197, + "avg_penalty/after_think": 3.8689222931861877, + "avg_penalty/before_target": 0.3900386430323124, + "avg_penalty/before_think": 0.5149011984467506, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.75, + "completions/max_terminated_length": 499.75, + "completions/mean_length": 159.1875, + "completions/mean_terminated_length": 159.1875, + "completions/min_length": 41.75, + "completions/min_terminated_length": 41.75, + "epoch": 0.843, + "grad_norm": 4.352124214172363, + "kl": 17.7109375, + "learning_rate": 1.4735983564590784e-06, + "loss": 1.6566, + "num_tokens": 48307126.0, + "reward": 1.69921875, + "reward_std": 0.6652703732252121, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3758598491549492, + "rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.30811311304569244, + "step": 1686, + "token_counts/after_target": 315.5, + "token_counts/after_think": 169.25, + "token_counts/before_target": 1379.25, + "token_counts/before_think": 683.0 + }, + { + "avg_penalty/after_target": 2.5443073511123657, + "avg_penalty/after_think": 2.9758604764938354, + "avg_penalty/before_target": 0.3613058440387249, + "avg_penalty/before_think": 0.4081430062651634, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + 
"completions/max_terminated_length": 531.0, + "completions/mean_length": 154.90625, + "completions/mean_terminated_length": 154.90625, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.8435, + "grad_norm": 2.438476324081421, + "kl": 20.296875, + "learning_rate": 1.4644920272467245e-06, + "loss": 1.7564, + "num_tokens": 48326464.0, + "reward": 1.68359375, + "reward_std": 0.6845770180225372, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3683478757739067, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.33864860236644745, + "step": 1687, + "token_counts/after_target": 344.0, + "token_counts/after_think": 28.0, + "token_counts/before_target": 1491.75, + "token_counts/before_think": 614.75 + }, + { + "avg_penalty/after_target": 2.764037102460861, + "avg_penalty/after_think": 3.8605732321739197, + "avg_penalty/before_target": 0.20815423130989075, + "avg_penalty/before_think": 0.3311100900173187, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.25, + "completions/max_terminated_length": 469.25, + "completions/mean_length": 135.734375, + "completions/mean_terminated_length": 135.734375, + "completions/min_length": 38.25, + "completions/min_terminated_length": 38.25, + "epoch": 0.844, + "grad_norm": 8.595236778259277, + "kl": 24.5625, + "learning_rate": 1.4554116986719258e-06, + "loss": 1.8479, + "num_tokens": 48345791.0, + "reward": 1.58984375, + "reward_std": 0.7789289504289627, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 
0.3748450353741646, + "step": 1688, + "token_counts/after_target": 164.5, + "token_counts/after_think": 19.0, + "token_counts/before_target": 1365.5, + "token_counts/before_think": 622.75 + }, + { + "avg_penalty/after_target": 2.198421359062195, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.5571432784199715, + "avg_penalty/before_think": 0.43886488676071167, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 812.25, + "completions/max_terminated_length": 688.75, + "completions/mean_length": 244.625, + "completions/mean_terminated_length": 220.33482360839844, + "completions/min_length": 30.25, + "completions/min_terminated_length": 30.25, + "epoch": 0.8445, + "grad_norm": 5.057324409484863, + "kl": 29.0, + "learning_rate": 1.446357398394934e-06, + "loss": 2.3662, + "num_tokens": 48377303.0, + "reward": 1.47265625, + "reward_std": 0.9106693863868713, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.44495995342731476, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.43264786899089813, + "step": 1689, + "token_counts/after_target": 1200.75, + "token_counts/after_think": 122.5, + "token_counts/before_target": 2130.75, + "token_counts/before_think": 460.0 + }, + { + "avg_penalty/after_target": 2.9210952520370483, + "avg_penalty/after_think": 3.954238176345825, + "avg_penalty/before_target": 0.21838980913162231, + "avg_penalty/before_think": 0.5742092877626419, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 617.25, + "completions/max_terminated_length": 617.25, + 
"completions/mean_length": 208.953125, + "completions/mean_terminated_length": 208.953125, + "completions/min_length": 42.5, + "completions/min_terminated_length": 42.5, + "epoch": 0.845, + "grad_norm": 3.5076003074645996, + "kl": 16.703125, + "learning_rate": 1.4373291539967182e-06, + "loss": 1.441, + "num_tokens": 48400932.0, + "reward": 1.6640625, + "reward_std": 0.6937754154205322, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.3188851624727249, + "step": 1690, + "token_counts/after_target": 312.5, + "token_counts/after_think": 30.0, + "token_counts/before_target": 1592.25, + "token_counts/before_think": 1408.5 + }, + { + "avg_penalty/after_target": 2.2798668444156647, + "avg_penalty/after_think": 2.7282899618148804, + "avg_penalty/before_target": 0.43898720294237137, + "avg_penalty/before_think": 0.5394638925790787, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.5, + "completions/max_terminated_length": 524.5, + "completions/mean_length": 196.09375, + "completions/mean_terminated_length": 196.09375, + "completions/min_length": 41.75, + "completions/min_terminated_length": 41.75, + "epoch": 0.8455, + "grad_norm": 4.4528069496154785, + "kl": 19.5625, + "learning_rate": 1.4283269929788779e-06, + "loss": 1.8375, + "num_tokens": 48423242.0, + "reward": 1.5546875, + "reward_std": 0.7059993296861649, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.3921433389186859, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.335692323744297, + "step": 1691, + 
"token_counts/after_target": 711.0, + "token_counts/after_think": 68.5, + "token_counts/before_target": 1391.75, + "token_counts/before_think": 966.25 + }, + { + "avg_penalty/after_target": 2.3906748294830322, + "avg_penalty/after_think": 3.7318190336227417, + "avg_penalty/before_target": 0.4527171514928341, + "avg_penalty/before_think": 0.46494047343730927, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.25, + "completions/max_terminated_length": 640.25, + "completions/mean_length": 163.625, + "completions/mean_terminated_length": 163.625, + "completions/min_length": 40.25, + "completions/min_terminated_length": 40.25, + "epoch": 0.846, + "grad_norm": 9.554941177368164, + "kl": 16.765625, + "learning_rate": 1.4193509427635543e-06, + "loss": 1.762, + "num_tokens": 48442114.0, + "reward": 1.6953125, + "reward_std": 0.6696922928094864, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3758598491549492, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.3067893497645855, + "step": 1692, + "token_counts/after_target": 554.25, + "token_counts/after_think": 48.25, + "token_counts/before_target": 1245.0, + "token_counts/before_think": 770.5 + }, + { + "avg_penalty/after_target": 2.448445498943329, + "avg_penalty/after_think": 0.7503447532653809, + "avg_penalty/before_target": 0.32409151643514633, + "avg_penalty/before_think": 0.3183739706873894, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.75, + "completions/max_terminated_length": 408.75, + "completions/mean_length": 151.484375, + 
"completions/mean_terminated_length": 151.484375, + "completions/min_length": 35.75, + "completions/min_terminated_length": 35.75, + "epoch": 0.8465, + "grad_norm": 4.707094669342041, + "kl": 21.875, + "learning_rate": 1.4104010306933558e-06, + "loss": 1.6975, + "num_tokens": 48459473.0, + "reward": 1.49609375, + "reward_std": 0.8077958226203918, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4260597825050354, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.3719446249306202, + "step": 1693, + "token_counts/after_target": 360.0, + "token_counts/after_think": 2.0, + "token_counts/before_target": 1314.0, + "token_counts/before_think": 747.75 + }, + { + "avg_penalty/after_target": 2.42692369222641, + "avg_penalty/after_think": 2.518388867378235, + "avg_penalty/before_target": 0.34705541282892227, + "avg_penalty/before_think": 0.47912295162677765, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.75, + "completions/max_terminated_length": 576.75, + "completions/mean_length": 193.5625, + "completions/mean_terminated_length": 193.5625, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.847, + "grad_norm": 3.8366501331329346, + "kl": 18.640625, + "learning_rate": 1.4014772840312663e-06, + "loss": 1.6294, + "num_tokens": 48482549.0, + "reward": 1.57421875, + "reward_std": 0.7830176055431366, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4255262687802315, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.37349018454551697, + "step": 1694, + "token_counts/after_target": 368.25, + 
"token_counts/after_think": 236.75, + "token_counts/before_target": 1687.75, + "token_counts/before_think": 804.25 + }, + { + "avg_penalty/after_target": 3.051117956638336, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.2650328502058983, + "avg_penalty/before_think": 0.3009229600429535, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 166.203125, + "completions/mean_terminated_length": 166.203125, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.8475, + "grad_norm": 3.95009708404541, + "kl": 19.484375, + "learning_rate": 1.3925797299605649e-06, + "loss": 1.6297, + "num_tokens": 48503474.0, + "reward": 1.578125, + "reward_std": 0.7926977574825287, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.43303824216127396, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.37600064277648926, + "step": 1695, + "token_counts/after_target": 414.75, + "token_counts/after_think": 8.5, + "token_counts/before_target": 1388.75, + "token_counts/before_think": 847.25 + }, + { + "avg_penalty/after_target": 2.697245866060257, + "avg_penalty/after_think": 0.7765948176383972, + "avg_penalty/before_target": 0.43757518008351326, + "avg_penalty/before_think": 0.3534466065466404, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 720.25, + "completions/max_terminated_length": 638.5, + "completions/mean_length": 210.421875, + "completions/mean_terminated_length": 186.32396697998047, + 
"completions/min_length": 45.5, + "completions/min_terminated_length": 45.5, + "epoch": 0.848, + "grad_norm": 3.998948574066162, + "kl": 27.5, + "learning_rate": 1.3837083955847418e-06, + "loss": 2.4456, + "num_tokens": 48525437.0, + "reward": 1.546875, + "reward_std": 0.7498559802770615, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4009781554341316, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.36363305896520615, + "step": 1696, + "token_counts/after_target": 860.0, + "token_counts/after_think": 4.75, + "token_counts/before_target": 1822.0, + "token_counts/before_think": 680.0 + }, + { + "avg_penalty/after_target": 2.0705889761447906, + "avg_penalty/after_think": 2.6661354303359985, + "avg_penalty/before_target": 0.4055969938635826, + "avg_penalty/before_think": 0.4758447855710983, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.25, + "completions/max_terminated_length": 638.25, + "completions/mean_length": 186.625, + "completions/mean_terminated_length": 186.625, + "completions/min_length": 36.5, + "completions/min_terminated_length": 36.5, + "epoch": 0.8485, + "grad_norm": 7.369972229003906, + "kl": 24.625, + "learning_rate": 1.3748633079274254e-06, + "loss": 1.8273, + "num_tokens": 48548949.0, + "reward": 1.44140625, + "reward_std": 0.797818124294281, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.45874278247356415, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.3712956979870796, + "step": 1697, + "token_counts/after_target": 365.0, + "token_counts/after_think": 19.25, + "token_counts/before_target": 1834.25, + 
"token_counts/before_think": 767.5 + }, + { + "avg_penalty/after_target": 2.092950224876404, + "avg_penalty/after_think": 1.9964739680290222, + "avg_penalty/before_target": 0.45329154282808304, + "avg_penalty/before_think": 0.7071559429168701, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.5, + "completions/max_terminated_length": 679.5, + "completions/mean_length": 215.25, + "completions/mean_terminated_length": 215.25, + "completions/min_length": 39.75, + "completions/min_terminated_length": 39.75, + "epoch": 0.849, + "grad_norm": 3.5069117546081543, + "kl": 21.96875, + "learning_rate": 1.3660444939322837e-06, + "loss": 1.9385, + "num_tokens": 48574197.0, + "reward": 1.47265625, + "reward_std": 0.8608347028493881, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.41697952151298523, + "step": 1698, + "token_counts/after_target": 920.25, + "token_counts/after_think": 54.5, + "token_counts/before_target": 1506.5, + "token_counts/before_think": 962.75 + }, + { + "avg_penalty/after_target": 2.130541890859604, + "avg_penalty/after_think": 1.9897370338439941, + "avg_penalty/before_target": 0.40305325388908386, + "avg_penalty/before_think": 0.46283601969480515, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 639.25, + "completions/max_terminated_length": 538.0, + "completions/mean_length": 199.015625, + "completions/mean_terminated_length": 185.81354522705078, + "completions/min_length": 41.75, + 
"completions/min_terminated_length": 41.75, + "epoch": 0.8495, + "grad_norm": 4.106459140777588, + "kl": 21.703125, + "learning_rate": 1.3572519804629537e-06, + "loss": 1.7971, + "num_tokens": 48597078.0, + "reward": 1.55078125, + "reward_std": 0.8142253160476685, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.3913332372903824, + "step": 1699, + "token_counts/after_target": 405.75, + "token_counts/after_think": 162.5, + "token_counts/before_target": 1491.25, + "token_counts/before_think": 1124.75 + }, + { + "avg_penalty/after_target": 3.1568769216537476, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.267518512904644, + "avg_penalty/before_think": 0.4242200329899788, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.25, + "completions/max_terminated_length": 574.25, + "completions/mean_length": 190.15625, + "completions/mean_terminated_length": 190.15625, + "completions/min_length": 33.25, + "completions/min_terminated_length": 33.25, + "epoch": 0.85, + "grad_norm": 5.152677059173584, + "kl": 23.658203125, + "learning_rate": 1.3484857943029572e-06, + "loss": 1.9156, + "num_tokens": 48620800.0, + "reward": 1.5234375, + "reward_std": 0.6895812898874283, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.35956869274377823, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.3334116190671921, + "step": 1700, + "token_counts/after_target": 543.5, + "token_counts/after_think": 26.0, + "token_counts/before_target": 1623.5, + "token_counts/before_think": 849.5 + 
}, + { + "avg_penalty/after_target": 2.4927929043769836, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.49859610199928284, + "avg_penalty/before_think": 0.3404374197125435, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.25, + "completions/max_terminated_length": 761.25, + "completions/mean_length": 200.96875, + "completions/mean_terminated_length": 200.96875, + "completions/min_length": 41.25, + "completions/min_terminated_length": 41.25, + "epoch": 0.8505, + "grad_norm": 7.5928802490234375, + "kl": 29.90625, + "learning_rate": 1.339745962155613e-06, + "loss": 2.2864, + "num_tokens": 48643118.0, + "reward": 1.421875, + "reward_std": 1.0086461007595062, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.1280868947505951, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.49776528775691986, + "rewards/tag_count_reward/mean": 0.6875, + "rewards/tag_count_reward/std": 0.44608065485954285, + "step": 1701, + "token_counts/after_target": 807.5, + "token_counts/after_think": 21.0, + "token_counts/before_target": 1839.25, + "token_counts/before_think": 547.75 + }, + { + "avg_penalty/after_target": 2.541392147541046, + "avg_penalty/after_think": 1.7326217889785767, + "avg_penalty/before_target": 0.40671030804514885, + "avg_penalty/before_think": 0.41543880105018616, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 594.5, + "completions/max_terminated_length": 468.5, + "completions/mean_length": 192.03125, + "completions/mean_terminated_length": 179.08854293823242, + "completions/min_length": 40.75, + "completions/min_terminated_length": 40.75, + "epoch": 0.851, + 
"grad_norm": 4.355011463165283, + "kl": 19.34375, + "learning_rate": 1.3310325106439725e-06, + "loss": 1.7373, + "num_tokens": 48665136.0, + "reward": 1.625, + "reward_std": 0.7876180559396744, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.40311288833618164, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.38678042590618134, + "step": 1702, + "token_counts/after_target": 717.25, + "token_counts/after_think": 11.25, + "token_counts/before_target": 1606.0, + "token_counts/before_think": 738.0 + }, + { + "avg_penalty/after_target": 2.102221727371216, + "avg_penalty/after_think": 3.0649763345718384, + "avg_penalty/before_target": 0.43607527762651443, + "avg_penalty/before_think": 0.5312122255563736, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.25, + "completions/max_terminated_length": 579.25, + "completions/mean_length": 168.5625, + "completions/mean_terminated_length": 168.5625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8515, + "grad_norm": 2.347412347793579, + "kl": 19.453125, + "learning_rate": 1.322345466310717e-06, + "loss": 1.6116, + "num_tokens": 48686116.0, + "reward": 1.54296875, + "reward_std": 0.8238652050495148, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44938503205776215, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.37275421619415283, + "step": 1703, + "token_counts/after_target": 388.75, + "token_counts/after_think": 49.0, + "token_counts/before_target": 1371.0, + "token_counts/before_think": 888.25 + }, + { + "avg_penalty/after_target": 
2.714837968349457, + "avg_penalty/after_think": 2.840079963207245, + "avg_penalty/before_target": 0.2388070747256279, + "avg_penalty/before_think": 0.35649652779102325, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.25, + "completions/max_terminated_length": 476.25, + "completions/mean_length": 150.828125, + "completions/mean_terminated_length": 150.828125, + "completions/min_length": 43.25, + "completions/min_terminated_length": 43.25, + "epoch": 0.852, + "grad_norm": 3.779088020324707, + "kl": 20.4375, + "learning_rate": 1.3136848556180893e-06, + "loss": 1.8507, + "num_tokens": 48705929.0, + "reward": 1.703125, + "reward_std": 0.6864040642976761, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.3604728877544403, + "rewards/tag_count_reward/mean": 0.859375, + "rewards/tag_count_reward/std": 0.3303109034895897, + "step": 1704, + "token_counts/after_target": 314.75, + "token_counts/after_think": 24.0, + "token_counts/before_target": 1287.25, + "token_counts/before_think": 787.25 + }, + { + "avg_penalty/after_target": 2.3767064213752747, + "avg_penalty/after_think": 2.803406774997711, + "avg_penalty/before_target": 0.48282110318541527, + "avg_penalty/before_think": 0.6235482394695282, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.5, + "completions/max_terminated_length": 550.5, + "completions/mean_length": 184.78125, + "completions/mean_terminated_length": 184.78125, + "completions/min_length": 45.75, + "completions/min_terminated_length": 45.75, + "epoch": 0.8525, + "grad_norm": 9.043021202087402, + "kl": 21.828125, + 
"learning_rate": 1.30505070494781e-06, + "loss": 2.1391, + "num_tokens": 48726267.0, + "reward": 1.5703125, + "reward_std": 0.8020909130573273, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.43303824216127396, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3769688084721565, + "step": 1705, + "token_counts/after_target": 635.5, + "token_counts/after_think": 25.0, + "token_counts/before_target": 1381.75, + "token_counts/before_think": 914.25 + }, + { + "avg_penalty/after_target": 3.3709486722946167, + "avg_penalty/after_think": 3.977933943271637, + "avg_penalty/before_target": 0.4501035623252392, + "avg_penalty/before_think": 0.5055704340338707, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 702.5, + "completions/max_terminated_length": 535.75, + "completions/mean_length": 188.625, + "completions/mean_terminated_length": 175.6343765258789, + "completions/min_length": 46.25, + "completions/min_terminated_length": 46.25, + "epoch": 0.853, + "grad_norm": 6.662205696105957, + "kl": 23.375, + "learning_rate": 1.2964430406010032e-06, + "loss": 2.1493, + "num_tokens": 48750675.0, + "reward": 1.57421875, + "reward_std": 0.8045978099107742, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4101393073797226, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.3969013914465904, + "step": 1706, + "token_counts/after_target": 523.75, + "token_counts/after_think": 22.5, + "token_counts/before_target": 1804.0, + "token_counts/before_think": 667.75 + }, + { + "avg_penalty/after_target": 2.2835038900375366, + "avg_penalty/after_think": 
3.887471914291382, + "avg_penalty/before_target": 0.43571847677230835, + "avg_penalty/before_think": 0.5649984106421471, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 183.65625, + "completions/mean_terminated_length": 183.65625, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.8535, + "grad_norm": 8.90345573425293, + "kl": 19.21875, + "learning_rate": 1.2878618887981064e-06, + "loss": 1.9681, + "num_tokens": 48772717.0, + "reward": 1.6796875, + "reward_std": 0.6870673596858978, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3683478757739067, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.32585785537958145, + "step": 1707, + "token_counts/after_target": 672.75, + "token_counts/after_think": 42.25, + "token_counts/before_target": 1269.25, + "token_counts/before_think": 954.25 + }, + { + "avg_penalty/after_target": 1.9660767018795013, + "avg_penalty/after_think": 3.5275447368621826, + "avg_penalty/before_target": 0.3880608454346657, + "avg_penalty/before_think": 0.5396952927112579, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.75, + "completions/max_terminated_length": 477.75, + "completions/mean_length": 168.5625, + "completions/mean_terminated_length": 168.5625, + "completions/min_length": 48.25, + "completions/min_terminated_length": 48.25, + "epoch": 0.854, + "grad_norm": 3.359278678894043, + "kl": 15.88671875, + "learning_rate": 1.279307275678795e-06, + "loss": 
1.4422, + "num_tokens": 48793233.0, + "reward": 1.6796875, + "reward_std": 0.6696932911872864, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3943893313407898, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.2867991030216217, + "step": 1708, + "token_counts/after_target": 401.5, + "token_counts/after_think": 92.0, + "token_counts/before_target": 1265.5, + "token_counts/before_think": 938.0 + }, + { + "avg_penalty/after_target": 2.117002695798874, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.44071026891469955, + "avg_penalty/before_think": 0.3800739161670208, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 623.75, + "completions/max_terminated_length": 623.75, + "completions/mean_length": 157.296875, + "completions/mean_terminated_length": 157.296875, + "completions/min_length": 48.75, + "completions/min_terminated_length": 48.75, + "epoch": 0.8545, + "grad_norm": 3.974470615386963, + "kl": 16.0546875, + "learning_rate": 1.2707792273019049e-06, + "loss": 1.5213, + "num_tokens": 48811012.0, + "reward": 1.73046875, + "reward_std": 0.559573620557785, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.2979728877544403, + "rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.26879560947418213, + "step": 1709, + "token_counts/after_target": 399.5, + "token_counts/after_think": 9.0, + "token_counts/before_target": 1218.75, + "token_counts/before_think": 889.5 + }, + { + "avg_penalty/after_target": 1.948669046163559, + "avg_penalty/after_think": 2.9027096033096313, + "avg_penalty/before_target": 0.3410455361008644, + 
"avg_penalty/before_think": 0.43603289872407913, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.5, + "completions/max_terminated_length": 603.5, + "completions/mean_length": 189.65625, + "completions/mean_terminated_length": 189.65625, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.855, + "grad_norm": 8.048849105834961, + "kl": 25.03125, + "learning_rate": 1.2622777696453482e-06, + "loss": 1.943, + "num_tokens": 48834318.0, + "reward": 1.59765625, + "reward_std": 0.7838618904352188, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.37680521607398987, + "step": 1710, + "token_counts/after_target": 419.5, + "token_counts/after_think": 39.25, + "token_counts/before_target": 1882.0, + "token_counts/before_think": 693.75 + }, + { + "avg_penalty/after_target": 2.2256467938423157, + "avg_penalty/after_think": 3.7434433698654175, + "avg_penalty/before_target": 0.40821218863129616, + "avg_penalty/before_think": 0.3261697217822075, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.5, + "completions/max_terminated_length": 449.5, + "completions/mean_length": 136.96875, + "completions/mean_terminated_length": 136.96875, + "completions/min_length": 47.25, + "completions/min_terminated_length": 47.25, + "epoch": 0.8555, + "grad_norm": 8.184167861938477, + "kl": 28.0, + "learning_rate": 1.2538029286060428e-06, + "loss": 2.1382, + "num_tokens": 48849900.0, + "reward": 1.51953125, + "reward_std": 
0.8013483136892319, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44187305867671967, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.37175214663147926, + "step": 1711, + "token_counts/after_target": 243.0, + "token_counts/after_think": 34.25, + "token_counts/before_target": 1302.5, + "token_counts/before_think": 611.75 + }, + { + "avg_penalty/after_target": 3.1690206229686737, + "avg_penalty/after_think": 3.3679061233997345, + "avg_penalty/before_target": 0.31484250351786613, + "avg_penalty/before_think": 0.494671031832695, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.25, + "completions/max_terminated_length": 573.25, + "completions/mean_length": 173.359375, + "completions/mean_terminated_length": 173.359375, + "completions/min_length": 40.5, + "completions/min_terminated_length": 40.5, + "epoch": 0.856, + "grad_norm": 4.924840450286865, + "kl": 23.359375, + "learning_rate": 1.2453547299998226e-06, + "loss": 1.931, + "num_tokens": 48870243.0, + "reward": 1.5234375, + "reward_std": 0.8547840416431427, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.4205542504787445, + "step": 1712, + "token_counts/after_target": 434.0, + "token_counts/after_think": 79.25, + "token_counts/before_target": 1719.75, + "token_counts/before_think": 540.75 + }, + { + "avg_penalty/after_target": 2.058002918958664, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3949975520372391, + "avg_penalty/before_think": 0.4255737215280533, + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.75, + "completions/max_terminated_length": 557.75, + "completions/mean_length": 172.953125, + "completions/mean_terminated_length": 172.953125, + "completions/min_length": 43.5, + "completions/min_terminated_length": 43.5, + "epoch": 0.8565, + "grad_norm": 7.111891269683838, + "kl": 22.109375, + "learning_rate": 1.2369331995613664e-06, + "loss": 1.7362, + "num_tokens": 48892528.0, + "reward": 1.59375, + "reward_std": 0.7903278768062592, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.37884192168712616, + "step": 1713, + "token_counts/after_target": 460.0, + "token_counts/after_think": 41.75, + "token_counts/before_target": 1486.5, + "token_counts/before_think": 779.0 + }, + { + "avg_penalty/after_target": 2.876542389392853, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.32609451934695244, + "avg_penalty/before_think": 0.43946850299835205, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.25, + "completions/max_terminated_length": 590.25, + "completions/mean_length": 184.03125, + "completions/mean_terminated_length": 184.03125, + "completions/min_length": 49.75, + "completions/min_terminated_length": 49.75, + "epoch": 0.857, + "grad_norm": 4.382112503051758, + "kl": 22.109375, + "learning_rate": 1.228538362944115e-06, + "loss": 1.9867, + "num_tokens": 48916754.0, + "reward": 1.546875, + "reward_std": 0.7665388435125351, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44187305867671967, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.35080981254577637, + "step": 1714, + "token_counts/after_target": 619.75, + "token_counts/after_think": 46.75, + "token_counts/before_target": 1751.25, + "token_counts/before_think": 526.75 + }, + { + "avg_penalty/after_target": 1.861255705356598, + "avg_penalty/after_think": 3.505715250968933, + "avg_penalty/before_target": 0.45330047607421875, + "avg_penalty/before_think": 0.31743336468935013, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.5, + "completions/max_terminated_length": 551.5, + "completions/mean_length": 150.90625, + "completions/mean_terminated_length": 150.90625, + "completions/min_length": 34.75, + "completions/min_terminated_length": 34.75, + "epoch": 0.8575, + "grad_norm": 4.933646202087402, + "kl": 19.5703125, + "learning_rate": 1.2201702457201948e-06, + "loss": 1.7624, + "num_tokens": 48936268.0, + "reward": 1.75390625, + "reward_std": 0.7344101071357727, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.356952004134655, + "rewards/tag_count_reward/mean": 0.86328125, + "rewards/tag_count_reward/std": 0.3471444100141525, + "step": 1715, + "token_counts/after_target": 378.25, + "token_counts/after_think": 65.75, + "token_counts/before_target": 1073.25, + "token_counts/before_think": 897.25 + }, + { + "avg_penalty/after_target": 2.5681630223989487, + "avg_penalty/after_think": 2.1662399768829346, + "avg_penalty/before_target": 0.3542369157075882, + "avg_penalty/before_think": 0.49771157652139664, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 
0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.5, + "completions/max_terminated_length": 580.5, + "completions/mean_length": 201.0, + "completions/mean_terminated_length": 201.0, + "completions/min_length": 42.5, + "completions/min_terminated_length": 42.5, + "epoch": 0.858, + "grad_norm": 3.6886656284332275, + "kl": 23.78125, + "learning_rate": 1.2118288733803474e-06, + "loss": 2.0949, + "num_tokens": 48957692.0, + "reward": 1.59375, + "reward_std": 0.8001217693090439, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.40263500809669495, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.3819338381290436, + "step": 1716, + "token_counts/after_target": 733.5, + "token_counts/after_think": 27.5, + "token_counts/before_target": 1732.25, + "token_counts/before_think": 722.75 + }, + { + "avg_penalty/after_target": 2.1593639254570007, + "avg_penalty/after_think": 2.8882389664649963, + "avg_penalty/before_target": 0.3825322948396206, + "avg_penalty/before_think": 0.4017805829644203, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 523.5, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 154.15625, + "completions/mean_terminated_length": 141.06250381469727, + "completions/min_length": 39.25, + "completions/min_terminated_length": 39.25, + "epoch": 0.8585, + "grad_norm": 5.067332744598389, + "kl": 13.9521484375, + "learning_rate": 1.2035142713338366e-06, + "loss": 1.3398, + "num_tokens": 48979942.0, + "reward": 1.75390625, + "reward_std": 0.5440884679555893, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.859375, + 
"rewards/format_reward/std": 0.2979728877544403, + "rewards/tag_count_reward/mean": 0.89453125, + "rewards/tag_count_reward/std": 0.2640974894165993, + "step": 1717, + "token_counts/after_target": 331.5, + "token_counts/after_think": 113.5, + "token_counts/before_target": 1151.5, + "token_counts/before_think": 870.0 + }, + { + "avg_penalty/after_target": 2.250088155269623, + "avg_penalty/after_think": 3.7103936076164246, + "avg_penalty/before_target": 0.3786366991698742, + "avg_penalty/before_think": 0.5518981963396072, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 155.15625, + "completions/mean_terminated_length": 155.15625, + "completions/min_length": 53.75, + "completions/min_terminated_length": 53.75, + "epoch": 0.859, + "grad_norm": 5.5202717781066895, + "kl": 22.59375, + "learning_rate": 1.19522646490838e-06, + "loss": 1.8281, + "num_tokens": 48999136.0, + "reward": 1.53515625, + "reward_std": 0.8149887472391129, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.43655145168304443, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.38785432279109955, + "step": 1718, + "token_counts/after_target": 542.5, + "token_counts/after_think": 67.0, + "token_counts/before_target": 1201.75, + "token_counts/before_think": 671.25 + }, + { + "avg_penalty/after_target": 1.9586450457572937, + "avg_penalty/after_think": 2.7595783472061157, + "avg_penalty/before_target": 0.44506874680519104, + "avg_penalty/before_think": 0.516104444861412, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 554.25, + "completions/max_terminated_length": 554.25, + "completions/mean_length": 160.96875, + "completions/mean_terminated_length": 160.96875, + "completions/min_length": 44.25, + "completions/min_terminated_length": 44.25, + "epoch": 0.8595, + "grad_norm": 5.720561981201172, + "kl": 20.453125, + "learning_rate": 1.1869654793500784e-06, + "loss": 1.9473, + "num_tokens": 49020750.0, + "reward": 1.66015625, + "reward_std": 0.8153094202280045, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.37188082188367844, + "step": 1719, + "token_counts/after_target": 520.0, + "token_counts/after_think": 48.25, + "token_counts/before_target": 1125.0, + "token_counts/before_think": 882.25 + }, + { + "avg_penalty/after_target": 2.653766930103302, + "avg_penalty/after_think": 3.939930558204651, + "avg_penalty/before_target": 0.5425318777561188, + "avg_penalty/before_think": 0.31886059790849686, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 564.25, + "completions/max_terminated_length": 361.75, + "completions/mean_length": 136.46875, + "completions/mean_terminated_length": 121.79687690734863, + "completions/min_length": 36.75, + "completions/min_terminated_length": 36.75, + "epoch": 0.86, + "grad_norm": 3.953099489212036, + "kl": 21.0380859375, + "learning_rate": 1.1787313398233235e-06, + "loss": 2.002, + "num_tokens": 49038844.0, + "reward": 1.6640625, + "reward_std": 0.684683158993721, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + 
"rewards/format_reward/std": 0.3837348371744156, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.3090904578566551, + "step": 1720, + "token_counts/after_target": 563.5, + "token_counts/after_think": 25.0, + "token_counts/before_target": 1112.5, + "token_counts/before_think": 482.5 + }, + { + "avg_penalty/after_target": 2.8643866777420044, + "avg_penalty/after_think": 0.765235424041748, + "avg_penalty/before_target": 0.42592037841677666, + "avg_penalty/before_think": 0.33277321234345436, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.5, + "completions/max_terminated_length": 513.5, + "completions/mean_length": 168.984375, + "completions/mean_terminated_length": 168.984375, + "completions/min_length": 28.5, + "completions/min_terminated_length": 28.5, + "epoch": 0.8605, + "grad_norm": 7.224451541900635, + "kl": 30.0, + "learning_rate": 1.1705240714107301e-06, + "loss": 2.2756, + "num_tokens": 49059307.0, + "reward": 1.41796875, + "reward_std": 0.8825975209474564, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.47360680997371674, + "rewards/tag_count_reward/mean": 0.73046875, + "rewards/tag_count_reward/std": 0.4148871600627899, + "step": 1721, + "token_counts/after_target": 640.5, + "token_counts/after_think": 41.5, + "token_counts/before_target": 1379.75, + "token_counts/before_think": 642.0 + }, + { + "avg_penalty/after_target": 1.9246060848236084, + "avg_penalty/after_think": 2.773300588130951, + "avg_penalty/before_target": 0.2888857163488865, + "avg_penalty/before_think": 0.5344186723232269, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 490.5, + "completions/max_terminated_length": 490.5, + "completions/mean_length": 168.671875, + "completions/mean_terminated_length": 168.671875, + "completions/min_length": 47.5, + "completions/min_terminated_length": 47.5, + "epoch": 0.861, + "grad_norm": 3.729513645172119, + "kl": 16.703125, + "learning_rate": 1.1623436991130654e-06, + "loss": 1.4525, + "num_tokens": 49080678.0, + "reward": 1.62890625, + "reward_std": 0.6490161269903183, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4000816270709038, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.2831771746277809, + "step": 1722, + "token_counts/after_target": 252.25, + "token_counts/after_think": 71.75, + "token_counts/before_target": 1314.5, + "token_counts/before_think": 1060.25 + }, + { + "avg_penalty/after_target": 1.4775705337524414, + "avg_penalty/after_think": 3.7518887519836426, + "avg_penalty/before_target": 0.4777369052171707, + "avg_penalty/before_think": 0.5376485139131546, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.5, + "completions/max_terminated_length": 699.5, + "completions/mean_length": 214.484375, + "completions/mean_terminated_length": 214.484375, + "completions/min_length": 42.5, + "completions/min_terminated_length": 42.5, + "epoch": 0.8615, + "grad_norm": 6.214291572570801, + "kl": 22.4140625, + "learning_rate": 1.1541902478491607e-06, + "loss": 2.021, + "num_tokens": 49105365.0, + "reward": 1.5546875, + "reward_std": 0.7637314796447754, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4215351790189743, + 
"rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.36212165653705597, + "step": 1723, + "token_counts/after_target": 743.5, + "token_counts/after_think": 117.0, + "token_counts/before_target": 1583.25, + "token_counts/before_think": 988.0 + }, + { + "avg_penalty/after_target": 1.7203711569309235, + "avg_penalty/after_think": 3.5835355520248413, + "avg_penalty/before_target": 0.48033007234334946, + "avg_penalty/before_think": 0.4993447810411453, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.0, + "completions/max_terminated_length": 649.0, + "completions/mean_length": 238.40625, + "completions/mean_terminated_length": 238.40625, + "completions/min_length": 43.5, + "completions/min_terminated_length": 43.5, + "epoch": 0.862, + "grad_norm": 11.070988655090332, + "kl": 28.71875, + "learning_rate": 1.1460637424558406e-06, + "loss": 2.1149, + "num_tokens": 49133023.0, + "reward": 1.41015625, + "reward_std": 0.8957777619361877, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.46566852182149887, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.4430803880095482, + "step": 1724, + "token_counts/after_target": 871.5, + "token_counts/after_think": 129.0, + "token_counts/before_target": 1847.75, + "token_counts/before_think": 966.25 + }, + { + "avg_penalty/after_target": 2.5054189562797546, + "avg_penalty/after_think": 3.7861708402633667, + "avg_penalty/before_target": 0.5166428536176682, + "avg_penalty/before_think": 0.38950005918741226, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + 
"completions/max_length": 805.75, + "completions/max_terminated_length": 614.25, + "completions/mean_length": 210.640625, + "completions/mean_terminated_length": 196.61979293823242, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.8625, + "grad_norm": 8.575618743896484, + "kl": 28.78125, + "learning_rate": 1.1379642076878528e-06, + "loss": 2.4615, + "num_tokens": 49155992.0, + "reward": 1.42578125, + "reward_std": 0.8750710487365723, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.47360680997371674, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.41443127393722534, + "step": 1725, + "token_counts/after_target": 892.5, + "token_counts/after_think": 85.75, + "token_counts/before_target": 1742.0, + "token_counts/before_think": 650.0 + }, + { + "avg_penalty/after_target": 2.52771058678627, + "avg_penalty/after_think": 3.232421398162842, + "avg_penalty/before_target": 0.45632168650627136, + "avg_penalty/before_think": 0.5416165068745613, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.75, + "completions/max_terminated_length": 707.75, + "completions/mean_length": 236.109375, + "completions/mean_terminated_length": 236.109375, + "completions/min_length": 51.75, + "completions/min_terminated_length": 51.75, + "epoch": 0.863, + "grad_norm": 9.842007637023926, + "kl": 29.46875, + "learning_rate": 1.129891668217783e-06, + "loss": 2.2011, + "num_tokens": 49182799.0, + "reward": 1.375, + "reward_std": 0.8832892626523972, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.45916909724473953, + "rewards/tag_count_reward/mean": 
0.703125, + "rewards/tag_count_reward/std": 0.42993561178445816, + "step": 1726, + "token_counts/after_target": 829.0, + "token_counts/after_think": 192.5, + "token_counts/before_target": 1918.5, + "token_counts/before_think": 837.75 + }, + { + "avg_penalty/after_target": 2.6649001240730286, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.34611431509256363, + "avg_penalty/before_think": 0.4525621756911278, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 188.296875, + "completions/mean_terminated_length": 188.296875, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.8635, + "grad_norm": 2.9183409214019775, + "kl": 18.21484375, + "learning_rate": 1.1218461486359878e-06, + "loss": 1.5927, + "num_tokens": 49204226.0, + "reward": 1.65234375, + "reward_std": 0.7117946147918701, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3837348371744156, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.33463743329048157, + "step": 1727, + "token_counts/after_target": 634.5, + "token_counts/after_think": 55.5, + "token_counts/before_target": 1402.25, + "token_counts/before_think": 920.5 + }, + { + "avg_penalty/after_target": 2.1135909855365753, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4366611912846565, + "avg_penalty/before_think": 0.34620755165815353, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.0, + "completions/max_terminated_length": 654.0, + 
"completions/mean_length": 214.671875, + "completions/mean_terminated_length": 214.671875, + "completions/min_length": 43.75, + "completions/min_terminated_length": 43.75, + "epoch": 0.864, + "grad_norm": 5.289516448974609, + "kl": 24.09375, + "learning_rate": 1.1138276734505105e-06, + "loss": 1.9709, + "num_tokens": 49230061.0, + "reward": 1.64453125, + "reward_std": 0.7564205974340439, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.3596511110663414, + "step": 1728, + "token_counts/after_target": 592.0, + "token_counts/after_think": 20.25, + "token_counts/before_target": 1935.0, + "token_counts/before_think": 887.5 + }, + { + "avg_penalty/after_target": 2.14611479640007, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4804936461150646, + "avg_penalty/before_think": 0.3288051299750805, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.25, + "completions/max_terminated_length": 657.25, + "completions/mean_length": 217.265625, + "completions/mean_terminated_length": 217.265625, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.8645, + "grad_norm": 2.5987420082092285, + "kl": 16.6015625, + "learning_rate": 1.1058362670870248e-06, + "loss": 1.4836, + "num_tokens": 49252238.0, + "reward": 1.78515625, + "reward_std": 0.6491455286741257, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.34944770485162735, + "rewards/tag_count_reward/mean": 0.89453125, + "rewards/tag_count_reward/std": 0.2682444304227829, + "step": 1729, + 
"token_counts/after_target": 567.75, + "token_counts/after_think": 45.75, + "token_counts/before_target": 1443.75, + "token_counts/before_think": 1419.0 + }, + { + "avg_penalty/after_target": 2.73309925198555, + "avg_penalty/after_think": 2.3577497601509094, + "avg_penalty/before_target": 0.36160455271601677, + "avg_penalty/before_think": 0.3569915257394314, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.25, + "completions/max_terminated_length": 646.25, + "completions/mean_length": 147.6875, + "completions/mean_terminated_length": 147.6875, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.865, + "grad_norm": 4.69894552230835, + "kl": 25.5, + "learning_rate": 1.097871953888735e-06, + "loss": 2.2046, + "num_tokens": 49269898.0, + "reward": 1.5703125, + "reward_std": 0.7748787105083466, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.35034405440092087, + "step": 1730, + "token_counts/after_target": 561.0, + "token_counts/after_think": 11.75, + "token_counts/before_target": 1117.25, + "token_counts/before_think": 673.0 + }, + { + "avg_penalty/after_target": 2.27588751912117, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4954063221812248, + "avg_penalty/before_think": 0.5090097784996033, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 197.78125, + "completions/mean_terminated_length": 
197.78125, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.8655, + "grad_norm": 4.931818962097168, + "kl": 20.5625, + "learning_rate": 1.0899347581163222e-06, + "loss": 1.967, + "num_tokens": 49293180.0, + "reward": 1.60546875, + "reward_std": 0.7320714890956879, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4075859263539314, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.34014473855495453, + "step": 1731, + "token_counts/after_target": 703.25, + "token_counts/after_think": 33.0, + "token_counts/before_target": 1692.25, + "token_counts/before_think": 736.0 + }, + { + "avg_penalty/after_target": 2.9685537219047546, + "avg_penalty/after_think": 3.834640622138977, + "avg_penalty/before_target": 0.27134596928954124, + "avg_penalty/before_think": 0.402416430413723, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.5, + "completions/max_terminated_length": 341.5, + "completions/mean_length": 142.53125, + "completions/mean_terminated_length": 142.53125, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.866, + "grad_norm": 3.7882397174835205, + "kl": 13.15625, + "learning_rate": 1.0820247039478605e-06, + "loss": 1.189, + "num_tokens": 49312014.0, + "reward": 1.6953125, + "reward_std": 0.7006063759326935, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38336414843797684, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.33392197638750076, + "step": 1732, + "token_counts/after_target": 159.75, + "token_counts/after_think": 43.5, + 
"token_counts/before_target": 1181.5, + "token_counts/before_think": 895.75 + }, + { + "avg_penalty/after_target": 2.3090361952781677, + "avg_penalty/after_think": 3.0896764993667603, + "avg_penalty/before_target": 0.47261540591716766, + "avg_penalty/before_think": 0.36684810370206833, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.25, + "completions/max_terminated_length": 632.25, + "completions/mean_length": 190.71875, + "completions/mean_terminated_length": 190.71875, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.8665, + "grad_norm": 4.793491840362549, + "kl": 25.6171875, + "learning_rate": 1.0741418154787443e-06, + "loss": 2.083, + "num_tokens": 49337708.0, + "reward": 1.4921875, + "reward_std": 0.8042746335268021, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4339347705245018, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.3741271495819092, + "step": 1733, + "token_counts/after_target": 663.75, + "token_counts/after_think": 45.5, + "token_counts/before_target": 1648.0, + "token_counts/before_think": 694.25 + }, + { + "avg_penalty/after_target": 2.0366270542144775, + "avg_penalty/after_think": 3.664156436920166, + "avg_penalty/before_target": 0.4302789866924286, + "avg_penalty/before_think": 0.4087802916765213, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 658.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 184.6875, + "completions/mean_terminated_length": 171.1500015258789, + 
"completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.867, + "grad_norm": 7.177804470062256, + "kl": 26.21484375, + "learning_rate": 1.0662861167216243e-06, + "loss": 2.0637, + "num_tokens": 49360840.0, + "reward": 1.546875, + "reward_std": 0.658893495798111, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.35956869274377823, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.30919238179922104, + "step": 1734, + "token_counts/after_target": 589.0, + "token_counts/after_think": 50.25, + "token_counts/before_target": 1685.25, + "token_counts/before_think": 630.5 + }, + { + "avg_penalty/after_target": 2.5675248503684998, + "avg_penalty/after_think": 3.649478495121002, + "avg_penalty/before_target": 0.3346395865082741, + "avg_penalty/before_think": 0.44298846274614334, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 593.25, + "completions/max_terminated_length": 443.75, + "completions/mean_length": 159.109375, + "completions/mean_terminated_length": 144.59271049499512, + "completions/min_length": 43.25, + "completions/min_terminated_length": 43.25, + "epoch": 0.8675, + "grad_norm": 5.846911430358887, + "kl": 13.5546875, + "learning_rate": 1.058457631606319e-06, + "loss": 1.4436, + "num_tokens": 49383471.0, + "reward": 1.8671875, + "reward_std": 0.3734663128852844, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.18616948276758194, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.1704004555940628, + "step": 1735, + "token_counts/after_target": 354.0, + "token_counts/after_think": 55.75, + 
"token_counts/before_target": 1172.0, + "token_counts/before_think": 964.0 + }, + { + "avg_penalty/after_target": 2.525357663631439, + "avg_penalty/after_think": 2.9187320470809937, + "avg_penalty/before_target": 0.38369467854499817, + "avg_penalty/before_think": 0.5224636495113373, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.5, + "completions/max_terminated_length": 603.5, + "completions/mean_length": 191.90625, + "completions/mean_terminated_length": 191.90625, + "completions/min_length": 33.5, + "completions/min_terminated_length": 33.5, + "epoch": 0.868, + "grad_norm": 3.3311140537261963, + "kl": 15.8125, + "learning_rate": 1.0506563839797501e-06, + "loss": 1.3085, + "num_tokens": 49405785.0, + "reward": 1.64453125, + "reward_std": 0.7467852979898453, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.39123913645744324, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.35990096628665924, + "step": 1736, + "token_counts/after_target": 530.0, + "token_counts/after_think": 17.0, + "token_counts/before_target": 1567.25, + "token_counts/before_think": 956.25 + }, + { + "avg_penalty/after_target": 2.3705747425556183, + "avg_penalty/after_think": 2.7918891310691833, + "avg_penalty/before_target": 0.3304919935762882, + "avg_penalty/before_think": 0.39221857115626335, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.25, + "completions/max_terminated_length": 536.25, + "completions/mean_length": 196.21875, + "completions/mean_terminated_length": 196.21875, + "completions/min_length": 44.0, + 
"completions/min_terminated_length": 44.0, + "epoch": 0.8685, + "grad_norm": 2.981351852416992, + "kl": 19.34375, + "learning_rate": 1.042882397605871e-06, + "loss": 1.7343, + "num_tokens": 49429271.0, + "reward": 1.7109375, + "reward_std": 0.7007052153348923, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.34944770485162735, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.3549359068274498, + "step": 1737, + "token_counts/after_target": 372.5, + "token_counts/after_think": 139.0, + "token_counts/before_target": 1592.75, + "token_counts/before_think": 1035.25 + }, + { + "avg_penalty/after_target": 2.5948815047740936, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4188213497400284, + "avg_penalty/before_think": 0.4412868991494179, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 195.515625, + "completions/mean_terminated_length": 195.515625, + "completions/min_length": 33.5, + "completions/min_terminated_length": 33.5, + "epoch": 0.869, + "grad_norm": 2.7963905334472656, + "kl": 15.5, + "learning_rate": 1.0351356961655945e-06, + "loss": 1.4179, + "num_tokens": 49450440.0, + "reward": 1.61328125, + "reward_std": 0.7911299765110016, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4141380712389946, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.38288672268390656, + "step": 1738, + "token_counts/after_target": 509.25, + "token_counts/after_think": 50.25, + "token_counts/before_target": 1557.25, + "token_counts/before_think": 1011.5 + 
}, + { + "avg_penalty/after_target": 2.9113535284996033, + "avg_penalty/after_think": 1.4253523349761963, + "avg_penalty/before_target": 0.32832255214452744, + "avg_penalty/before_think": 0.38888875395059586, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 639.75, + "completions/max_terminated_length": 478.75, + "completions/mean_length": 175.28125, + "completions/mean_terminated_length": 161.60416793823242, + "completions/min_length": 47.25, + "completions/min_terminated_length": 47.25, + "epoch": 0.8695, + "grad_norm": 3.1821773052215576, + "kl": 23.75, + "learning_rate": 1.0274163032567165e-06, + "loss": 2.0507, + "num_tokens": 49471434.0, + "reward": 1.61328125, + "reward_std": 0.7644963711500168, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.3979102149605751, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.37074608355760574, + "step": 1739, + "token_counts/after_target": 473.5, + "token_counts/after_think": 20.75, + "token_counts/before_target": 1508.25, + "token_counts/before_think": 802.0 + }, + { + "avg_penalty/after_target": 2.4643019437789917, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.35704831033945084, + "avg_penalty/before_think": 0.34273403882980347, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 174.9375, + "completions/mean_terminated_length": 174.9375, + "completions/min_length": 38.75, + "completions/min_terminated_length": 38.75, + "epoch": 0.87, + "grad_norm": 
8.857806205749512, + "kl": 14.0390625, + "learning_rate": 1.0197242423938447e-06, + "loss": 1.51, + "num_tokens": 49491014.0, + "reward": 1.703125, + "reward_std": 0.6154037714004517, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.39123913645744324, + "rewards/tag_count_reward/mean": 0.890625, + "rewards/tag_count_reward/std": 0.22731868363916874, + "step": 1740, + "token_counts/after_target": 419.25, + "token_counts/after_think": 108.0, + "token_counts/before_target": 1338.25, + "token_counts/before_think": 933.5 + }, + { + "avg_penalty/after_target": 2.3824096024036407, + "avg_penalty/after_think": 3.7338095903396606, + "avg_penalty/before_target": 0.4651809558272362, + "avg_penalty/before_think": 0.5814906284213066, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 738.0, + "completions/max_terminated_length": 686.5, + "completions/mean_length": 240.875, + "completions/mean_terminated_length": 230.12709045410156, + "completions/min_length": 47.75, + "completions/min_terminated_length": 47.75, + "epoch": 0.8705, + "grad_norm": 3.960862874984741, + "kl": 25.09375, + "learning_rate": 1.012059537008332e-06, + "loss": 2.2625, + "num_tokens": 49515502.0, + "reward": 1.5546875, + "reward_std": 0.8049565255641937, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.38058697432279587, + "step": 1741, + "token_counts/after_target": 844.75, + "token_counts/after_think": 217.25, + "token_counts/before_target": 1892.0, + "token_counts/before_think": 900.0 + }, + { + "avg_penalty/after_target": 2.0979692935943604, 
+ "avg_penalty/after_think": 2.8796483278274536, + "avg_penalty/before_target": 0.3024883382022381, + "avg_penalty/before_think": 0.47916504740715027, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.5, + "completions/max_terminated_length": 543.5, + "completions/mean_length": 169.484375, + "completions/mean_terminated_length": 169.484375, + "completions/min_length": 38.25, + "completions/min_terminated_length": 38.25, + "epoch": 0.871, + "grad_norm": 4.967376708984375, + "kl": 12.205078125, + "learning_rate": 1.004422210448197e-06, + "loss": 1.2066, + "num_tokens": 49537005.0, + "reward": 1.69140625, + "reward_std": 0.5693006068468094, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.33226002007722855, + "rewards/tag_count_reward/mean": 0.87890625, + "rewards/tag_count_reward/std": 0.2605842798948288, + "step": 1742, + "token_counts/after_target": 381.25, + "token_counts/after_think": 159.25, + "token_counts/before_target": 1360.0, + "token_counts/before_think": 811.25 + }, + { + "avg_penalty/after_target": 1.8663058280944824, + "avg_penalty/after_think": 3.9582483768463135, + "avg_penalty/before_target": 0.2675296850502491, + "avg_penalty/before_think": 0.45867444574832916, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 178.84375, + "completions/mean_terminated_length": 178.84375, + "completions/min_length": 35.75, + "completions/min_terminated_length": 35.75, + "epoch": 0.8715, + "grad_norm": 7.169205665588379, + "kl": 19.265625, + 
"learning_rate": 9.968122859780648e-07, + "loss": 1.4544, + "num_tokens": 49558259.0, + "reward": 1.64453125, + "reward_std": 0.7485761344432831, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.37366948276758194, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.3627830296754837, + "step": 1743, + "token_counts/after_target": 274.0, + "token_counts/after_think": 50.75, + "token_counts/before_target": 1732.0, + "token_counts/before_think": 804.75 + }, + { + "avg_penalty/after_target": 1.7344196289777756, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4414417892694473, + "avg_penalty/before_think": 0.4096025303006172, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.25, + "completions/max_terminated_length": 779.25, + "completions/mean_length": 271.328125, + "completions/mean_terminated_length": 271.328125, + "completions/min_length": 60.5, + "completions/min_terminated_length": 60.5, + "epoch": 0.872, + "grad_norm": 12.07043170928955, + "kl": 31.90625, + "learning_rate": 9.892297867790846e-07, + "loss": 2.3728, + "num_tokens": 49585928.0, + "reward": 1.421875, + "reward_std": 0.8961028456687927, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4787135720252991, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.4295596405863762, + "step": 1744, + "token_counts/after_target": 893.25, + "token_counts/after_think": 87.75, + "token_counts/before_target": 2252.0, + "token_counts/before_think": 1108.25 + }, + { + "avg_penalty/after_target": 2.6022156476974487, + "avg_penalty/after_think": 2.3967320919036865, + 
"avg_penalty/before_target": 0.33358465880155563, + "avg_penalty/before_think": 0.42100415378808975, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 154.171875, + "completions/mean_terminated_length": 154.171875, + "completions/min_length": 40.75, + "completions/min_terminated_length": 40.75, + "epoch": 0.8725, + "grad_norm": 6.381731033325195, + "kl": 20.1015625, + "learning_rate": 9.816747359488632e-07, + "loss": 1.9253, + "num_tokens": 49604867.0, + "reward": 1.6796875, + "reward_std": 0.7294385135173798, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38772592693567276, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.34967708587646484, + "step": 1745, + "token_counts/after_target": 527.5, + "token_counts/after_think": 25.25, + "token_counts/before_target": 988.75, + "token_counts/before_think": 925.25 + }, + { + "avg_penalty/after_target": 2.5974877327680588, + "avg_penalty/after_think": 3.431411921977997, + "avg_penalty/before_target": 0.37164418771862984, + "avg_penalty/before_think": 0.31473615765571594, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 685.0, + "completions/max_terminated_length": 567.75, + "completions/mean_length": 179.921875, + "completions/mean_terminated_length": 166.0708351135254, + "completions/min_length": 37.25, + "completions/min_terminated_length": 37.25, + "epoch": 0.873, + "grad_norm": 7.005917549133301, + "kl": 24.28125, + "learning_rate": 9.74147156501396e-07, + "loss": 
1.9003, + "num_tokens": 49626270.0, + "reward": 1.6171875, + "reward_std": 0.8086050748825073, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.125, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4075859263539314, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.37325485050678253, + "step": 1746, + "token_counts/after_target": 454.5, + "token_counts/after_think": 44.75, + "token_counts/before_target": 1487.25, + "token_counts/before_think": 892.25 + }, + { + "avg_penalty/after_target": 2.223324567079544, + "avg_penalty/after_think": 2.404196619987488, + "avg_penalty/before_target": 0.3860529288649559, + "avg_penalty/before_think": 0.35677991062402725, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.75, + "completions/max_terminated_length": 543.75, + "completions/mean_length": 170.90625, + "completions/mean_terminated_length": 170.90625, + "completions/min_length": 41.75, + "completions/min_terminated_length": 41.75, + "epoch": 0.8735, + "grad_norm": 3.3510544300079346, + "kl": 19.46875, + "learning_rate": 9.666470713669918e-07, + "loss": 1.7577, + "num_tokens": 49647032.0, + "reward": 1.62109375, + "reward_std": 0.7595148980617523, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.3597148507833481, + "step": 1747, + "token_counts/after_target": 434.75, + "token_counts/after_think": 21.0, + "token_counts/before_target": 1571.75, + "token_counts/before_think": 707.0 + }, + { + "avg_penalty/after_target": 2.734668791294098, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3906131014227867, + 
"avg_penalty/before_think": 0.4056458920240402, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.5, + "completions/max_terminated_length": 526.5, + "completions/mean_length": 197.109375, + "completions/mean_terminated_length": 197.109375, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.874, + "grad_norm": 3.7647438049316406, + "kl": 15.005859375, + "learning_rate": 9.591745033922173e-07, + "loss": 1.4337, + "num_tokens": 49671023.0, + "reward": 1.6875, + "reward_std": 0.554185301065445, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.29930340498685837, + "rewards/tag_count_reward/mean": 0.859375, + "rewards/tag_count_reward/std": 0.2684899643063545, + "step": 1748, + "token_counts/after_target": 573.75, + "token_counts/after_think": 80.25, + "token_counts/before_target": 1764.5, + "token_counts/before_think": 735.25 + }, + { + "avg_penalty/after_target": 3.060553014278412, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.31064851209521294, + "avg_penalty/before_think": 0.37254442274570465, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.5, + "completions/max_terminated_length": 444.5, + "completions/mean_length": 164.96875, + "completions/mean_terminated_length": 164.96875, + "completions/min_length": 48.75, + "completions/min_terminated_length": 48.75, + "epoch": 0.8745, + "grad_norm": 3.722499132156372, + "kl": 17.09375, + "learning_rate": 9.517294753398066e-07, + "loss": 1.5083, + "num_tokens": 49692317.0, + "reward": 1.625, + "reward_std": 0.7108798325061798, 
+ "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4075859263539314, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.32478949427604675, + "step": 1749, + "token_counts/after_target": 399.0, + "token_counts/after_think": 40.75, + "token_counts/before_target": 1444.75, + "token_counts/before_think": 755.0 + }, + { + "avg_penalty/after_target": 2.8665901124477386, + "avg_penalty/after_think": 2.9242321848869324, + "avg_penalty/before_target": 0.6265478208661079, + "avg_penalty/before_think": 0.8028643056750298, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.25, + "completions/max_terminated_length": 673.25, + "completions/mean_length": 211.890625, + "completions/mean_terminated_length": 211.890625, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.875, + "grad_norm": 5.814980983734131, + "kl": 29.9375, + "learning_rate": 9.44312009888606e-07, + "loss": 2.7136, + "num_tokens": 49716054.0, + "reward": 1.52734375, + "reward_std": 0.8111541271209717, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44938503205776215, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.3750772476196289, + "step": 1750, + "token_counts/after_target": 1056.5, + "token_counts/after_think": 72.0, + "token_counts/before_target": 1669.5, + "token_counts/before_think": 592.25 + }, + { + "avg_penalty/after_target": 2.4356769323349, + "avg_penalty/after_think": 2.741288125514984, + "avg_penalty/before_target": 0.3568481057882309, + "avg_penalty/before_think": 0.4168896712362766, + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.5, + "completions/max_terminated_length": 462.5, + "completions/mean_length": 135.0, + "completions/mean_terminated_length": 135.0, + "completions/min_length": 35.75, + "completions/min_terminated_length": 35.75, + "epoch": 0.8755, + "grad_norm": 3.902421712875366, + "kl": 18.15625, + "learning_rate": 9.369221296335007e-07, + "loss": 1.487, + "num_tokens": 49733670.0, + "reward": 1.5703125, + "reward_std": 0.7980249524116516, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4255262687802315, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.38594041019678116, + "step": 1751, + "token_counts/after_target": 298.0, + "token_counts/after_think": 18.0, + "token_counts/before_target": 1169.5, + "token_counts/before_think": 674.5 + }, + { + "avg_penalty/after_target": 3.0275211334228516, + "avg_penalty/after_think": 2.824990451335907, + "avg_penalty/before_target": 0.5402299612760544, + "avg_penalty/before_think": 0.36910804361104965, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 621.25, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 238.125, + "completions/mean_terminated_length": 228.08646392822266, + "completions/min_length": 59.75, + "completions/min_terminated_length": 59.75, + "epoch": 0.876, + "grad_norm": 3.5316624641418457, + "kl": 22.515625, + "learning_rate": 9.295598570853514e-07, + "loss": 1.983, + "num_tokens": 49761294.0, + "reward": 1.61328125, + "reward_std": 0.7133418172597885, + "rewards/accuracy_reward/mean": NaN, + 
"rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.3925696536898613, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.3385891392827034, + "step": 1752, + "token_counts/after_target": 947.75, + "token_counts/after_think": 160.75, + "token_counts/before_target": 1675.75, + "token_counts/before_think": 1025.75 + }, + { + "avg_penalty/after_target": 1.4776664078235626, + "avg_penalty/after_think": 3.7119104862213135, + "avg_penalty/before_target": 0.5568764507770538, + "avg_penalty/before_think": 0.47363951802253723, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 655.5, + "completions/max_terminated_length": 655.5, + "completions/mean_length": 234.296875, + "completions/mean_terminated_length": 234.296875, + "completions/min_length": 54.75, + "completions/min_terminated_length": 54.75, + "epoch": 0.8765, + "grad_norm": 4.485888957977295, + "kl": 23.625, + "learning_rate": 9.222252146709143e-07, + "loss": 1.9221, + "num_tokens": 49786961.0, + "reward": 1.55859375, + "reward_std": 0.7864101827144623, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4308478757739067, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.3737919554114342, + "step": 1753, + "token_counts/after_target": 820.75, + "token_counts/after_think": 36.75, + "token_counts/before_target": 1717.25, + "token_counts/before_think": 1174.0 + }, + { + "avg_penalty/after_target": 2.5289426147937775, + "avg_penalty/after_think": 3.6137737035751343, + "avg_penalty/before_target": 0.44800030440092087, + "avg_penalty/before_think": 0.5566434562206268, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.25, + "completions/max_terminated_length": 630.25, + "completions/mean_length": 202.53125, + "completions/mean_terminated_length": 202.53125, + "completions/min_length": 33.5, + "completions/min_terminated_length": 33.5, + "epoch": 0.877, + "grad_norm": 4.480983734130859, + "kl": 16.375, + "learning_rate": 9.149182247327837e-07, + "loss": 1.6183, + "num_tokens": 49813475.0, + "reward": 1.6953125, + "reward_std": 0.6733722388744354, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3758598491549492, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.31941793113946915, + "step": 1754, + "token_counts/after_target": 640.25, + "token_counts/after_think": 107.5, + "token_counts/before_target": 1273.0, + "token_counts/before_think": 1219.75 + }, + { + "avg_penalty/after_target": 3.222856342792511, + "avg_penalty/after_think": 2.4796989262104034, + "avg_penalty/before_target": 0.30962124839425087, + "avg_penalty/before_think": 0.5328884497284889, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 751.25, + "completions/max_terminated_length": 633.25, + "completions/mean_length": 246.265625, + "completions/mean_terminated_length": 235.51145935058594, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.8775, + "grad_norm": 4.90346097946167, + "kl": 22.953125, + "learning_rate": 9.076389095293148e-07, + "loss": 1.9035, + "num_tokens": 49838100.0, + "reward": 1.53515625, + "reward_std": 0.8167071789503098, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4308478757739067, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.3968895897269249, + "step": 1755, + "token_counts/after_target": 962.5, + "token_counts/after_think": 49.5, + "token_counts/before_target": 2131.5, + "token_counts/before_think": 796.75 + }, + { + "avg_penalty/after_target": 2.740447461605072, + "avg_penalty/after_think": 3.5809483528137207, + "avg_penalty/before_target": 0.48960762843489647, + "avg_penalty/before_think": 0.5370688177645206, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 724.5, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 238.1875, + "completions/mean_terminated_length": 227.2750015258789, + "completions/min_length": 49.75, + "completions/min_terminated_length": 49.75, + "epoch": 0.878, + "grad_norm": 3.627807378768921, + "kl": 28.5625, + "learning_rate": 9.00387291234569e-07, + "loss": 2.3523, + "num_tokens": 49866064.0, + "reward": 1.48046875, + "reward_std": 0.8401400446891785, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45247192680835724, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.4003024846315384, + "step": 1756, + "token_counts/after_target": 1142.25, + "token_counts/after_think": 64.25, + "token_counts/before_target": 1952.0, + "token_counts/before_think": 652.5 + }, + { + "avg_penalty/after_target": 2.7210921347141266, + "avg_penalty/after_think": 3.7393094897270203, + "avg_penalty/before_target": 0.31174323335289955, + "avg_penalty/before_think": 0.4629008024930954, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.5, + "completions/max_terminated_length": 498.5, + "completions/mean_length": 177.375, + "completions/mean_terminated_length": 177.375, + "completions/min_length": 47.25, + "completions/min_terminated_length": 47.25, + "epoch": 0.8785, + "grad_norm": 4.180509090423584, + "kl": 20.796875, + "learning_rate": 8.931633919382299e-07, + "loss": 1.7595, + "num_tokens": 49888552.0, + "reward": 1.63671875, + "reward_std": 0.7395423203706741, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.37937305867671967, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.36226969957351685, + "step": 1757, + "token_counts/after_target": 425.75, + "token_counts/after_think": 50.5, + "token_counts/before_target": 1362.25, + "token_counts/before_think": 999.5 + }, + { + "avg_penalty/after_target": 2.581716001033783, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.4750814586877823, + "avg_penalty/before_think": 0.29882562905550003, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.5, + "completions/max_terminated_length": 541.5, + "completions/mean_length": 176.78125, + "completions/mean_terminated_length": 176.78125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.879, + "grad_norm": 5.532930850982666, + "kl": 22.71875, + "learning_rate": 8.859672336455471e-07, + "loss": 2.0533, + "num_tokens": 49909002.0, + "reward": 1.62890625, + "reward_std": 0.7622987031936646, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + 
"rewards/format_reward/std": 0.4097762927412987, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.3612581267952919, + "step": 1758, + "token_counts/after_target": 532.0, + "token_counts/after_think": 33.75, + "token_counts/before_target": 1206.25, + "token_counts/before_think": 1056.5 + }, + { + "avg_penalty/after_target": 2.4625017642974854, + "avg_penalty/after_think": 2.698945462703705, + "avg_penalty/before_target": 0.35320496559143066, + "avg_penalty/before_think": 0.40660329908132553, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 195.671875, + "completions/mean_terminated_length": 195.671875, + "completions/min_length": 56.75, + "completions/min_terminated_length": 56.75, + "epoch": 0.8795, + "grad_norm": 3.984063148498535, + "kl": 25.28125, + "learning_rate": 8.787988382772705e-07, + "loss": 2.0783, + "num_tokens": 49929429.0, + "reward": 1.59765625, + "reward_std": 0.7822378575801849, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.42516325414180756, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.36540672928094864, + "step": 1759, + "token_counts/after_target": 570.75, + "token_counts/after_think": 19.5, + "token_counts/before_target": 1720.5, + "token_counts/before_think": 820.0 + }, + { + "avg_penalty/after_target": 3.0723297595977783, + "avg_penalty/after_think": 3.2998134791851044, + "avg_penalty/before_target": 0.46616964042186737, + "avg_penalty/before_think": 0.5851802006363869, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 701.5, + "completions/max_terminated_length": 701.5, + "completions/mean_length": 192.328125, + "completions/mean_terminated_length": 192.328125, + "completions/min_length": 45.5, + "completions/min_terminated_length": 45.5, + "epoch": 0.88, + "grad_norm": 13.014607429504395, + "kl": 23.0, + "learning_rate": 8.716582276695729e-07, + "loss": 2.4117, + "num_tokens": 49949530.0, + "reward": 1.63671875, + "reward_std": 0.7417902946472168, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.3512769341468811, + "step": 1760, + "token_counts/after_target": 703.5, + "token_counts/after_think": 133.5, + "token_counts/before_target": 1492.25, + "token_counts/before_think": 748.0 + }, + { + "avg_penalty/after_target": 2.6474353671073914, + "avg_penalty/after_think": 3.8731759786605835, + "avg_penalty/before_target": 0.3447229154407978, + "avg_penalty/before_think": 0.42421942204236984, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.25, + "completions/max_terminated_length": 475.25, + "completions/mean_length": 183.375, + "completions/mean_terminated_length": 183.375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.8805, + "grad_norm": 2.9376721382141113, + "kl": 12.4921875, + "learning_rate": 8.645454235739903e-07, + "loss": 1.1904, + "num_tokens": 49970210.0, + "reward": 1.671875, + "reward_std": 0.7327363789081573, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38336414843797684, + 
"rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.3517322689294815, + "step": 1761, + "token_counts/after_target": 374.0, + "token_counts/after_think": 144.75, + "token_counts/before_target": 1298.75, + "token_counts/before_think": 1116.5 + }, + { + "avg_penalty/after_target": 2.522163063287735, + "avg_penalty/after_think": 3.8141082525253296, + "avg_penalty/before_target": 0.5092176124453545, + "avg_penalty/before_think": 0.6195738911628723, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 685.25, + "completions/max_terminated_length": 629.25, + "completions/mean_length": 238.234375, + "completions/mean_terminated_length": 227.19583892822266, + "completions/min_length": 45.5, + "completions/min_terminated_length": 45.5, + "epoch": 0.881, + "grad_norm": 4.259902477264404, + "kl": 32.8125, + "learning_rate": 8.574604476573623e-07, + "loss": 2.7309, + "num_tokens": 49998545.0, + "reward": 1.44921875, + "reward_std": 0.8756774663925171, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4625816270709038, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.42350687831640244, + "step": 1762, + "token_counts/after_target": 994.25, + "token_counts/after_think": 124.25, + "token_counts/before_target": 1808.0, + "token_counts/before_think": 885.25 + }, + { + "avg_penalty/after_target": 2.941741496324539, + "avg_penalty/after_think": 3.6440204977989197, + "avg_penalty/before_target": 0.457406472414732, + "avg_penalty/before_think": 0.540184736251831, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + 
"completions/max_length": 570.5, + "completions/max_terminated_length": 570.5, + "completions/mean_length": 175.59375, + "completions/mean_terminated_length": 175.59375, + "completions/min_length": 22.75, + "completions/min_terminated_length": 22.75, + "epoch": 0.8815, + "grad_norm": 3.5693931579589844, + "kl": 23.71875, + "learning_rate": 8.504033215017527e-07, + "loss": 2.0691, + "num_tokens": 50020919.0, + "reward": 1.5546875, + "reward_std": 0.7880300730466843, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.43655145168304443, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.36812828481197357, + "step": 1763, + "token_counts/after_target": 594.5, + "token_counts/after_think": 35.0, + "token_counts/before_target": 1295.5, + "token_counts/before_think": 884.5 + }, + { + "avg_penalty/after_target": 1.6892869472503662, + "avg_penalty/after_think": 2.721760094165802, + "avg_penalty/before_target": 0.29114336147904396, + "avg_penalty/before_think": 0.4622979760169983, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.25, + "completions/max_terminated_length": 479.25, + "completions/mean_length": 140.5625, + "completions/mean_terminated_length": 140.5625, + "completions/min_length": 41.75, + "completions/min_terminated_length": 41.75, + "epoch": 0.882, + "grad_norm": 11.755843162536621, + "kl": 9.58203125, + "learning_rate": 8.433740666043899e-07, + "loss": 1.256, + "num_tokens": 50039659.0, + "reward": 1.8046875, + "reward_std": 0.5114246010780334, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.33406074345111847, + "rewards/tag_count_reward/mean": 0.9296875, + 
"rewards/tag_count_reward/std": 0.183040589094162, + "step": 1764, + "token_counts/after_target": 223.0, + "token_counts/after_think": 24.5, + "token_counts/before_target": 1049.5, + "token_counts/before_think": 952.0 + }, + { + "avg_penalty/after_target": 2.600119411945343, + "avg_penalty/after_think": 3.889037609100342, + "avg_penalty/before_target": 0.41658276319503784, + "avg_penalty/before_think": 0.6864178329706192, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 787.75, + "completions/max_terminated_length": 687.5, + "completions/mean_length": 244.46875, + "completions/mean_terminated_length": 231.7125015258789, + "completions/min_length": 45.25, + "completions/min_terminated_length": 45.25, + "epoch": 0.8825, + "grad_norm": 3.862513780593872, + "kl": 23.046875, + "learning_rate": 8.363727043776037e-07, + "loss": 2.0183, + "num_tokens": 50066857.0, + "reward": 1.55859375, + "reward_std": 0.8114984035491943, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42867646366357803, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.39207465946674347, + "step": 1765, + "token_counts/after_target": 893.5, + "token_counts/after_think": 37.75, + "token_counts/before_target": 1847.5, + "token_counts/before_think": 1132.75 + }, + { + "avg_penalty/after_target": 2.356612503528595, + "avg_penalty/after_think": 3.8811310529708862, + "avg_penalty/before_target": 0.4126936085522175, + "avg_penalty/before_think": 0.6278814896941185, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 760.0, + 
"completions/max_terminated_length": 677.0, + "completions/mean_length": 209.484375, + "completions/mean_terminated_length": 197.2052116394043, + "completions/min_length": 40.5, + "completions/min_terminated_length": 40.5, + "epoch": 0.883, + "grad_norm": 4.229369163513184, + "kl": 28.765625, + "learning_rate": 8.293992561487596e-07, + "loss": 2.3654, + "num_tokens": 50088216.0, + "reward": 1.5390625, + "reward_std": 0.8707127422094345, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.4162086099386215, + "step": 1766, + "token_counts/after_target": 596.75, + "token_counts/after_think": 148.0, + "token_counts/before_target": 1774.25, + "token_counts/before_think": 832.75 + }, + { + "avg_penalty/after_target": 2.3010602593421936, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.5451022274792194, + "avg_penalty/before_think": 0.5750671625137329, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.5, + "completions/max_terminated_length": 692.5, + "completions/mean_length": 178.78125, + "completions/mean_terminated_length": 178.78125, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.8835, + "grad_norm": 7.2824578285217285, + "kl": 25.71875, + "learning_rate": 8.224537431601886e-07, + "loss": 2.3216, + "num_tokens": 50110650.0, + "reward": 1.6328125, + "reward_std": 0.7367137670516968, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 
0.34600311517715454, + "step": 1767, + "token_counts/after_target": 674.0, + "token_counts/after_think": 105.75, + "token_counts/before_target": 1336.25, + "token_counts/before_think": 744.5 + }, + { + "avg_penalty/after_target": 2.2166904509067535, + "avg_penalty/after_think": 3.4130241572856903, + "avg_penalty/before_target": 0.3471752740442753, + "avg_penalty/before_think": 0.67571871727705, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.25, + "completions/max_terminated_length": 707.25, + "completions/mean_length": 192.640625, + "completions/mean_terminated_length": 192.640625, + "completions/min_length": 38.75, + "completions/min_terminated_length": 38.75, + "epoch": 0.884, + "grad_norm": 6.445265769958496, + "kl": 27.0, + "learning_rate": 8.155361865691291e-07, + "loss": 2.1516, + "num_tokens": 50133011.0, + "reward": 1.49609375, + "reward_std": 0.8463146239519119, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4519384130835533, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.41235826909542084, + "step": 1768, + "token_counts/after_target": 344.75, + "token_counts/after_think": 168.5, + "token_counts/before_target": 1644.25, + "token_counts/before_think": 924.75 + }, + { + "avg_penalty/after_target": 2.242672637104988, + "avg_penalty/after_think": 2.1588771045207977, + "avg_penalty/before_target": 0.5099894255399704, + "avg_penalty/before_think": 0.4343145564198494, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 758.75, + "completions/max_terminated_length": 551.0, + 
"completions/mean_length": 229.5, + "completions/mean_terminated_length": 203.3937530517578, + "completions/min_length": 50.25, + "completions/min_terminated_length": 50.25, + "epoch": 0.8845, + "grad_norm": 2.7538399696350098, + "kl": 25.28125, + "learning_rate": 8.086466074476562e-07, + "loss": 2.1688, + "num_tokens": 50156899.0, + "reward": 1.62109375, + "reward_std": 0.708758220076561, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4101393073797226, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.31705373898148537, + "step": 1769, + "token_counts/after_target": 901.75, + "token_counts/after_think": 21.25, + "token_counts/before_target": 1931.75, + "token_counts/before_think": 817.25 + }, + { + "avg_penalty/after_target": 3.680331826210022, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.2939802184700966, + "avg_penalty/before_think": 0.3989397883415222, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.75, + "completions/max_terminated_length": 519.75, + "completions/mean_length": 199.71875, + "completions/mean_terminated_length": 199.71875, + "completions/min_length": 41.5, + "completions/min_terminated_length": 41.5, + "epoch": 0.885, + "grad_norm": 4.357487201690674, + "kl": 25.25, + "learning_rate": 8.017850267826233e-07, + "loss": 2.0789, + "num_tokens": 50178849.0, + "reward": 1.546875, + "reward_std": 0.7885768860578537, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4009781554341316, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.3770883232355118, + "step": 1770, + 
"token_counts/after_target": 652.75, + "token_counts/after_think": 30.25, + "token_counts/before_target": 1880.5, + "token_counts/before_think": 632.0 + }, + { + "avg_penalty/after_target": 2.6027021408081055, + "avg_penalty/after_think": 2.8742886781692505, + "avg_penalty/before_target": 0.579589270055294, + "avg_penalty/before_think": 0.4963117390871048, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.75, + "completions/max_terminated_length": 645.75, + "completions/mean_length": 202.828125, + "completions/mean_terminated_length": 202.828125, + "completions/min_length": 49.25, + "completions/min_terminated_length": 49.25, + "epoch": 0.8855, + "grad_norm": 6.216944217681885, + "kl": 25.0625, + "learning_rate": 7.949514654755963e-07, + "loss": 2.2869, + "num_tokens": 50201622.0, + "reward": 1.56640625, + "reward_std": 0.7646530717611313, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.40316852182149887, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.36577853560447693, + "step": 1771, + "token_counts/after_target": 737.5, + "token_counts/after_think": 87.25, + "token_counts/before_target": 1365.75, + "token_counts/before_think": 1054.75 + }, + { + "avg_penalty/after_target": 2.2380520701408386, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.43706272915005684, + "avg_penalty/before_think": 0.6023718118667603, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 724.75, + "completions/max_terminated_length": 617.5, + "completions/mean_length": 195.296875, + 
"completions/mean_terminated_length": 182.34479522705078, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.886, + "grad_norm": 4.773584365844727, + "kl": 21.65625, + "learning_rate": 7.881459443427885e-07, + "loss": 1.922, + "num_tokens": 50224569.0, + "reward": 1.61328125, + "reward_std": 0.7638303339481354, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.422013059258461, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.36411139369010925, + "step": 1772, + "token_counts/after_target": 619.5, + "token_counts/after_think": 76.5, + "token_counts/before_target": 1469.0, + "token_counts/before_think": 959.75 + }, + { + "avg_penalty/after_target": 3.084192216396332, + "avg_penalty/after_think": 2.951093554496765, + "avg_penalty/before_target": 0.31011658534407616, + "avg_penalty/before_think": 0.4021632969379425, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 538.75, + "completions/max_terminated_length": 538.75, + "completions/mean_length": 179.296875, + "completions/mean_terminated_length": 179.296875, + "completions/min_length": 35.5, + "completions/min_terminated_length": 35.5, + "epoch": 0.8865, + "grad_norm": 4.733163833618164, + "kl": 20.390625, + "learning_rate": 7.81368484114996e-07, + "loss": 1.7278, + "num_tokens": 50248156.0, + "reward": 1.56640625, + "reward_std": 0.773873820900917, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4154609143733978, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.37546028196811676, + "step": 1773, + "token_counts/after_target": 512.25, + 
"token_counts/after_think": 33.5, + "token_counts/before_target": 1350.75, + "token_counts/before_think": 972.25 + }, + { + "avg_penalty/after_target": 2.4720773100852966, + "avg_penalty/after_think": 2.6593430042266846, + "avg_penalty/before_target": 0.3637946732342243, + "avg_penalty/before_think": 0.5467929616570473, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 247.5625, + "completions/mean_terminated_length": 247.5625, + "completions/min_length": 38.25, + "completions/min_terminated_length": 38.25, + "epoch": 0.887, + "grad_norm": 2.7200841903686523, + "kl": 16.671875, + "learning_rate": 7.746191054375363e-07, + "loss": 1.421, + "num_tokens": 50273680.0, + "reward": 1.58203125, + "reward_std": 0.7334192097187042, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.41503459960222244, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.36008598655462265, + "step": 1774, + "token_counts/after_target": 771.25, + "token_counts/after_think": 55.25, + "token_counts/before_target": 1860.25, + "token_counts/before_think": 1274.25 + }, + { + "avg_penalty/after_target": 2.6355740427970886, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5186905823647976, + "avg_penalty/before_think": 0.39927922934293747, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 781.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 288.109375, + "completions/mean_terminated_length": 265.5698013305664, + 
"completions/min_length": 71.5, + "completions/min_terminated_length": 71.5, + "epoch": 0.8875, + "grad_norm": 5.460658073425293, + "kl": 23.890625, + "learning_rate": 7.678978288701911e-07, + "loss": 1.9091, + "num_tokens": 50303495.0, + "reward": 1.4296875, + "reward_std": 0.8038655817508698, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.46449070423841476, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.3724554404616356, + "step": 1775, + "token_counts/after_target": 1183.0, + "token_counts/after_think": 32.5, + "token_counts/before_target": 2196.5, + "token_counts/before_think": 1197.75 + }, + { + "avg_penalty/after_target": 2.8239684402942657, + "avg_penalty/after_think": 2.786672532558441, + "avg_penalty/before_target": 0.3729798458516598, + "avg_penalty/before_think": 0.3934687674045563, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 667.75, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 182.25, + "completions/mean_terminated_length": 169.16250228881836, + "completions/min_length": 57.25, + "completions/min_terminated_length": 57.25, + "epoch": 0.888, + "grad_norm": 6.406950950622559, + "kl": 21.8125, + "learning_rate": 7.612046748871327e-07, + "loss": 2.086, + "num_tokens": 50326647.0, + "reward": 1.6484375, + "reward_std": 0.7885985672473907, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.3979102149605751, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.36136677861213684, + "step": 1776, + "token_counts/after_target": 665.5, + "token_counts/after_think": 31.75, + 
"token_counts/before_target": 1413.0, + "token_counts/before_think": 805.75 + }, + { + "avg_penalty/after_target": 2.2574531733989716, + "avg_penalty/after_think": 3.8567915558815002, + "avg_penalty/before_target": 0.37334130704402924, + "avg_penalty/before_think": 0.4173416346311569, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.75, + "completions/max_terminated_length": 569.75, + "completions/mean_length": 174.015625, + "completions/mean_terminated_length": 174.015625, + "completions/min_length": 38.75, + "completions/min_terminated_length": 38.75, + "epoch": 0.8885, + "grad_norm": 8.560312271118164, + "kl": 9.65625, + "learning_rate": 7.545396638768698e-07, + "loss": 1.1476, + "num_tokens": 50349240.0, + "reward": 1.84765625, + "reward_std": 0.6810061037540436, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.18616948276758194, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.356952004134655, + "rewards/tag_count_reward/mean": 0.91015625, + "rewards/tag_count_reward/std": 0.25602760910987854, + "step": 1777, + "token_counts/after_target": 461.5, + "token_counts/after_think": 51.5, + "token_counts/before_target": 1339.0, + "token_counts/before_think": 932.25 + }, + { + "avg_penalty/after_target": 2.9682296812534332, + "avg_penalty/after_think": 2.3647477626800537, + "avg_penalty/before_target": 0.36707209050655365, + "avg_penalty/before_think": 0.5638592094182968, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 217.953125, + "completions/mean_terminated_length": 217.953125, + 
"completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.889, + "grad_norm": 5.347495079040527, + "kl": 24.078125, + "learning_rate": 7.479028161421798e-07, + "loss": 2.1571, + "num_tokens": 50374149.0, + "reward": 1.49609375, + "reward_std": 0.8185160160064697, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45247192680835724, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.3803015500307083, + "step": 1778, + "token_counts/after_target": 880.0, + "token_counts/after_think": 182.75, + "token_counts/before_target": 1750.5, + "token_counts/before_think": 674.0 + }, + { + "avg_penalty/after_target": 2.635670244693756, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3505037985742092, + "avg_penalty/before_think": 0.41933348029851913, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.25, + "completions/max_terminated_length": 527.25, + "completions/mean_length": 178.15625, + "completions/mean_terminated_length": 178.15625, + "completions/min_length": 35.25, + "completions/min_terminated_length": 35.25, + "epoch": 0.8895, + "grad_norm": 4.194962501525879, + "kl": 20.40625, + "learning_rate": 7.412941519000527e-07, + "loss": 1.6721, + "num_tokens": 50397151.0, + "reward": 1.51953125, + "reward_std": 0.8265768140554428, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4260597825050354, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.4074820727109909, + "step": 1779, + "token_counts/after_target": 470.75, + "token_counts/after_think": 36.75, + "token_counts/before_target": 1772.5, + 
"token_counts/before_think": 570.5 + }, + { + "avg_penalty/after_target": 2.447664886713028, + "avg_penalty/after_think": 3.678511679172516, + "avg_penalty/before_target": 0.5413774773478508, + "avg_penalty/before_think": 0.6092538386583328, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 808.0, + "completions/max_terminated_length": 675.25, + "completions/mean_length": 253.84375, + "completions/mean_terminated_length": 240.88229370117188, + "completions/min_length": 44.5, + "completions/min_terminated_length": 44.5, + "epoch": 0.89, + "grad_norm": 5.852245807647705, + "kl": 21.078125, + "learning_rate": 7.347136912816277e-07, + "loss": 2.0083, + "num_tokens": 50429285.0, + "reward": 1.66015625, + "reward_std": 0.698460265994072, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.37675637751817703, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.31376326084136963, + "step": 1780, + "token_counts/after_target": 896.5, + "token_counts/after_think": 113.25, + "token_counts/before_target": 2098.0, + "token_counts/before_think": 953.75 + }, + { + "avg_penalty/after_target": 3.0993547439575195, + "avg_penalty/after_think": 3.7649786472320557, + "avg_penalty/before_target": 0.44650042802095413, + "avg_penalty/before_think": 0.3256109729409218, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 522.75, + "completions/max_terminated_length": 460.5, + "completions/mean_length": 161.25, + "completions/mean_terminated_length": 149.1125030517578, + "completions/min_length": 45.75, + 
"completions/min_terminated_length": 45.75, + "epoch": 0.8905, + "grad_norm": 6.429171562194824, + "kl": 18.5107421875, + "learning_rate": 7.281614543321269e-07, + "loss": 1.8413, + "num_tokens": 50448501.0, + "reward": 1.703125, + "reward_std": 0.5518823713064194, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.31687305867671967, + "rewards/tag_count_reward/mean": 0.875, + "rewards/tag_count_reward/std": 0.24512263759970665, + "step": 1781, + "token_counts/after_target": 616.75, + "token_counts/after_think": 33.25, + "token_counts/before_target": 1129.0, + "token_counts/before_think": 801.0 + }, + { + "avg_penalty/after_target": 2.1948742270469666, + "avg_penalty/after_think": 2.802435874938965, + "avg_penalty/before_target": 0.49063707143068314, + "avg_penalty/before_think": 0.4405870735645294, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.25, + "completions/max_terminated_length": 762.25, + "completions/mean_length": 237.5625, + "completions/mean_terminated_length": 237.5625, + "completions/min_length": 47.5, + "completions/min_terminated_length": 47.5, + "epoch": 0.891, + "grad_norm": 3.61672306060791, + "kl": 26.65625, + "learning_rate": 7.216374610108012e-07, + "loss": 2.2227, + "num_tokens": 50475017.0, + "reward": 1.60546875, + "reward_std": 0.755373403429985, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.39476002007722855, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.36480943858623505, + "step": 1782, + "token_counts/after_target": 858.25, + "token_counts/after_think": 7.0, + "token_counts/before_target": 1976.25, + 
"token_counts/before_think": 959.5 + }, + { + "avg_penalty/after_target": 2.1351656913757324, + "avg_penalty/after_think": 2.982651472091675, + "avg_penalty/before_target": 0.37742625176906586, + "avg_penalty/before_think": 0.5357038229703903, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.25, + "completions/max_terminated_length": 530.25, + "completions/mean_length": 196.640625, + "completions/mean_terminated_length": 196.640625, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.8915, + "grad_norm": 7.449787616729736, + "kl": 18.625, + "learning_rate": 7.151417311908648e-07, + "loss": 1.4576, + "num_tokens": 50496722.0, + "reward": 1.58203125, + "reward_std": 0.7736530154943466, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4260597825050354, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.3646627217531204, + "step": 1783, + "token_counts/after_target": 417.25, + "token_counts/after_think": 66.5, + "token_counts/before_target": 1772.75, + "token_counts/before_think": 889.75 + }, + { + "avg_penalty/after_target": 1.9761373102664948, + "avg_penalty/after_think": 2.6617228984832764, + "avg_penalty/before_target": 0.3378157615661621, + "avg_penalty/before_think": 0.39162012189626694, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.5, + "completions/max_terminated_length": 355.5, + "completions/mean_length": 134.21875, + "completions/mean_terminated_length": 134.21875, + "completions/min_length": 39.5, + "completions/min_terminated_length": 39.5, 
+ "epoch": 0.892, + "grad_norm": 4.332816123962402, + "kl": 18.357421875, + "learning_rate": 7.086742846594385e-07, + "loss": 1.5749, + "num_tokens": 50514400.0, + "reward": 1.6328125, + "reward_std": 0.726948469877243, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.37366948276758194, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.35373854637145996, + "step": 1784, + "token_counts/after_target": 215.25, + "token_counts/after_think": 83.75, + "token_counts/before_target": 1199.25, + "token_counts/before_think": 649.25 + }, + { + "avg_penalty/after_target": 2.832911729812622, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.39122970402240753, + "avg_penalty/before_think": 0.5488224029541016, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.5, + "completions/max_terminated_length": 536.5, + "completions/mean_length": 219.96875, + "completions/mean_terminated_length": 219.96875, + "completions/min_length": 43.75, + "completions/min_terminated_length": 43.75, + "epoch": 0.8925, + "grad_norm": 3.3602559566497803, + "kl": 21.46875, + "learning_rate": 7.022351411174866e-07, + "loss": 1.8202, + "num_tokens": 50538974.0, + "reward": 1.51953125, + "reward_std": 0.8176388442516327, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4462348371744156, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.3893853425979614, + "step": 1785, + "token_counts/after_target": 716.75, + "token_counts/after_think": 99.25, + "token_counts/before_target": 1772.5, + "token_counts/before_think": 931.0 + }, + { + "avg_penalty/after_target": 
2.246225267648697, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3433251641690731, + "avg_penalty/before_think": 0.6616450846195221, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.0, + "completions/max_terminated_length": 588.0, + "completions/mean_length": 209.234375, + "completions/mean_terminated_length": 209.234375, + "completions/min_length": 48.75, + "completions/min_terminated_length": 48.75, + "epoch": 0.893, + "grad_norm": 4.450282096862793, + "kl": 21.375, + "learning_rate": 6.958243201797554e-07, + "loss": 1.7903, + "num_tokens": 50564557.0, + "reward": 1.49609375, + "reward_std": 0.8480512499809265, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4519384130835533, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.40504419803619385, + "step": 1786, + "token_counts/after_target": 573.0, + "token_counts/after_think": 90.5, + "token_counts/before_target": 1646.25, + "token_counts/before_think": 1038.0 + }, + { + "avg_penalty/after_target": 2.8936530351638794, + "avg_penalty/after_think": 1.6858401894569397, + "avg_penalty/before_target": 0.3202616386115551, + "avg_penalty/before_think": 0.449079193174839, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.25, + "completions/max_terminated_length": 503.25, + "completions/mean_length": 210.1875, + "completions/mean_terminated_length": 210.1875, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.8935, + "grad_norm": 3.0767509937286377, + "kl": 19.34375, + "learning_rate": 
6.894418413747183e-07, + "loss": 1.6201, + "num_tokens": 50586073.0, + "reward": 1.66796875, + "reward_std": 0.6956785768270493, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3943893313407898, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.32237808406352997, + "step": 1787, + "token_counts/after_target": 476.75, + "token_counts/after_think": 23.0, + "token_counts/before_target": 1454.75, + "token_counts/before_think": 1408.5 + }, + { + "avg_penalty/after_target": 2.443628042936325, + "avg_penalty/after_think": 2.5983026027679443, + "avg_penalty/before_target": 0.36255136877298355, + "avg_penalty/before_think": 0.4998033940792084, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.75, + "completions/max_terminated_length": 569.75, + "completions/mean_length": 182.890625, + "completions/mean_terminated_length": 182.890625, + "completions/min_length": 44.25, + "completions/min_terminated_length": 44.25, + "epoch": 0.894, + "grad_norm": 2.5236546993255615, + "kl": 14.21875, + "learning_rate": 6.83087724144511e-07, + "loss": 1.2719, + "num_tokens": 50607602.0, + "reward": 1.75, + "reward_std": 0.744251161813736, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36797718703746796, + "rewards/tag_count_reward/mean": 0.859375, + "rewards/tag_count_reward/std": 0.3399307131767273, + "step": 1788, + "token_counts/after_target": 453.0, + "token_counts/after_think": 27.5, + "token_counts/before_target": 1493.25, + "token_counts/before_think": 952.5 + }, + { + "avg_penalty/after_target": 2.0143097043037415, + "avg_penalty/after_think": 
3.8106712102890015, + "avg_penalty/before_target": 0.2516200430691242, + "avg_penalty/before_think": 0.4884750694036484, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.25, + "completions/max_terminated_length": 556.25, + "completions/mean_length": 206.21875, + "completions/mean_terminated_length": 206.21875, + "completions/min_length": 30.25, + "completions/min_terminated_length": 30.25, + "epoch": 0.8945, + "grad_norm": 15.48936939239502, + "kl": 25.078125, + "learning_rate": 6.767619878448783e-07, + "loss": 1.6408, + "num_tokens": 50634032.0, + "reward": 1.49609375, + "reward_std": 0.887768417596817, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.14789126068353653, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.46034691482782364, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.4137103483080864, + "step": 1789, + "token_counts/after_target": 324.75, + "token_counts/after_think": 78.25, + "token_counts/before_target": 1964.25, + "token_counts/before_think": 932.25 + }, + { + "avg_penalty/after_target": 2.6141828298568726, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.437204547226429, + "avg_penalty/before_think": 0.4107122905552387, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.5, + "completions/max_terminated_length": 522.5, + "completions/mean_length": 205.234375, + "completions/mean_terminated_length": 205.234375, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.895, + "grad_norm": 4.789834499359131, + "kl": 14.9296875, + "learning_rate": 6.704646517451108e-07, + 
"loss": 1.4642, + "num_tokens": 50656703.0, + "reward": 1.62890625, + "reward_std": 0.6677917093038559, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.36967839300632477, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.31015437468886375, + "step": 1790, + "token_counts/after_target": 744.0, + "token_counts/after_think": 21.25, + "token_counts/before_target": 1552.25, + "token_counts/before_think": 966.25 + }, + { + "avg_penalty/after_target": 2.488916367292404, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4454956576228142, + "avg_penalty/before_think": 0.40033793821930885, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.5, + "completions/max_terminated_length": 651.5, + "completions/mean_length": 242.0, + "completions/mean_terminated_length": 242.0, + "completions/min_length": 50.5, + "completions/min_terminated_length": 50.5, + "epoch": 0.8955, + "grad_norm": 5.9631733894348145, + "kl": 26.09375, + "learning_rate": 6.641957350279838e-07, + "loss": 2.0682, + "num_tokens": 50681295.0, + "reward": 1.44921875, + "reward_std": 0.8519698977470398, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.46034691482782364, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.40366389602422714, + "step": 1791, + "token_counts/after_target": 760.75, + "token_counts/after_think": 108.0, + "token_counts/before_target": 1717.75, + "token_counts/before_think": 1285.5 + }, + { + "avg_penalty/after_target": 3.2137425541877747, + "avg_penalty/after_think": 2.917621612548828, + "avg_penalty/before_target": 0.26732325926423073, + 
"avg_penalty/before_think": 0.39391863346099854, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.5, + "completions/max_terminated_length": 520.5, + "completions/mean_length": 146.5, + "completions/mean_terminated_length": 146.5, + "completions/min_length": 35.75, + "completions/min_terminated_length": 35.75, + "epoch": 0.896, + "grad_norm": 6.056423664093018, + "kl": 19.65625, + "learning_rate": 6.579552567897052e-07, + "loss": 1.8745, + "num_tokens": 50700255.0, + "reward": 1.6875, + "reward_std": 0.7059217989444733, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.3529609143733978, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.3529609143733978, + "step": 1792, + "token_counts/after_target": 133.0, + "token_counts/after_think": 187.0, + "token_counts/before_target": 1242.5, + "token_counts/before_think": 781.5 + }, + { + "avg_penalty/after_target": 2.685564935207367, + "avg_penalty/after_think": 2.801143169403076, + "avg_penalty/before_target": 0.4913434498012066, + "avg_penalty/before_think": 0.41072334349155426, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 681.25, + "completions/max_terminated_length": 567.25, + "completions/mean_length": 258.234375, + "completions/mean_terminated_length": 247.6072998046875, + "completions/min_length": 50.5, + "completions/min_terminated_length": 50.5, + "epoch": 0.8965, + "grad_norm": 6.882413387298584, + "kl": 24.28125, + "learning_rate": 6.517432360398556e-07, + "loss": 1.928, + "num_tokens": 50728414.0, + "reward": 1.4296875, + "reward_std": 
0.8697933554649353, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.45028156042099, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.43262410908937454, + "step": 1793, + "token_counts/after_target": 1064.0, + "token_counts/after_think": 38.5, + "token_counts/before_target": 1439.75, + "token_counts/before_think": 1589.5 + }, + { + "avg_penalty/after_target": 2.6200798749923706, + "avg_penalty/after_think": 3.8303584456443787, + "avg_penalty/before_target": 0.4091082140803337, + "avg_penalty/before_think": 0.6414746828377247, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 649.75, + "completions/max_terminated_length": 576.5, + "completions/mean_length": 270.390625, + "completions/mean_terminated_length": 259.7375030517578, + "completions/min_length": 59.75, + "completions/min_terminated_length": 59.75, + "epoch": 0.897, + "grad_norm": 3.3046300411224365, + "kl": 19.171875, + "learning_rate": 6.455596917013274e-07, + "loss": 1.7543, + "num_tokens": 50755015.0, + "reward": 1.5078125, + "reward_std": 0.7771517187356949, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.41520625352859497, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.3704495131969452, + "step": 1794, + "token_counts/after_target": 776.75, + "token_counts/after_think": 266.75, + "token_counts/before_target": 2111.0, + "token_counts/before_think": 1171.75 + }, + { + "avg_penalty/after_target": 2.1555221676826477, + "avg_penalty/after_think": 3.417878568172455, + "avg_penalty/before_target": 0.4395500272512436, + "avg_penalty/before_think": 0.5589036643505096, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.5, + "completions/max_terminated_length": 635.5, + "completions/mean_length": 207.1875, + "completions/mean_terminated_length": 207.1875, + "completions/min_length": 53.5, + "completions/min_terminated_length": 53.5, + "epoch": 0.8975, + "grad_norm": 9.547971725463867, + "kl": 17.25, + "learning_rate": 6.394046426102673e-07, + "loss": 1.7773, + "num_tokens": 50776499.0, + "reward": 1.66796875, + "reward_std": 0.6773079931735992, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.11180340498685837, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.3811737820506096, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.2737437430769205, + "step": 1795, + "token_counts/after_target": 751.0, + "token_counts/after_think": 42.25, + "token_counts/before_target": 1471.0, + "token_counts/before_think": 1050.75 + }, + { + "avg_penalty/after_target": 2.554079234600067, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.394951943308115, + "avg_penalty/before_think": 0.5838696509599686, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 674.5, + "completions/max_terminated_length": 635.25, + "completions/mean_length": 212.4375, + "completions/mean_terminated_length": 200.07291793823242, + "completions/min_length": 53.75, + "completions/min_terminated_length": 53.75, + "epoch": 0.898, + "grad_norm": 9.12779712677002, + "kl": 16.0859375, + "learning_rate": 6.332781075160244e-07, + "loss": 1.6831, + "num_tokens": 50802399.0, + "reward": 1.66796875, + "reward_std": 0.649172455072403, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.36435678601264954, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.30173180997371674, + "step": 1796, + "token_counts/after_target": 779.25, + "token_counts/after_think": 80.75, + "token_counts/before_target": 1460.75, + "token_counts/before_think": 1078.25 + }, + { + "avg_penalty/after_target": 2.579803913831711, + "avg_penalty/after_think": 2.8813485503196716, + "avg_penalty/before_target": 0.32046566158533096, + "avg_penalty/before_think": 0.4760473072528839, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.5, + "completions/max_terminated_length": 498.5, + "completions/mean_length": 169.640625, + "completions/mean_terminated_length": 169.640625, + "completions/min_length": 37.25, + "completions/min_terminated_length": 37.25, + "epoch": 0.8985, + "grad_norm": 2.5740644931793213, + "kl": 13.0625, + "learning_rate": 6.271801050810856e-07, + "loss": 1.175, + "num_tokens": 50831656.0, + "reward": 1.68359375, + "reward_std": 0.7346638441085815, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3683478757739067, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.35051756352186203, + "step": 1797, + "token_counts/after_target": 364.25, + "token_counts/after_think": 57.0, + "token_counts/before_target": 1080.75, + "token_counts/before_think": 1212.25 + }, + { + "avg_penalty/after_target": 2.679727703332901, + "avg_penalty/after_think": 2.671001136302948, + "avg_penalty/before_target": 0.3213142640888691, + "avg_penalty/before_think": 0.513305626809597, + "clip_ratio/high_max": 0.0, 
+ "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.75, + "completions/max_terminated_length": 403.75, + "completions/mean_length": 160.890625, + "completions/mean_terminated_length": 160.890625, + "completions/min_length": 39.25, + "completions/min_terminated_length": 39.25, + "epoch": 0.899, + "grad_norm": 2.737290859222412, + "kl": 15.75, + "learning_rate": 6.21110653881023e-07, + "loss": 1.3816, + "num_tokens": 50852369.0, + "reward": 1.6015625, + "reward_std": 0.7654021978378296, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.36113331466913223, + "step": 1798, + "token_counts/after_target": 309.5, + "token_counts/after_think": 41.25, + "token_counts/before_target": 1352.25, + "token_counts/before_think": 871.25 + }, + { + "avg_penalty/after_target": 2.4805906116962433, + "avg_penalty/after_think": 1.7592735290527344, + "avg_penalty/before_target": 0.49058134108781815, + "avg_penalty/before_think": 0.41347768157720566, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 791.5, + "completions/max_terminated_length": 546.75, + "completions/mean_length": 225.296875, + "completions/mean_terminated_length": 198.3781280517578, + "completions/min_length": 44.25, + "completions/min_terminated_length": 44.25, + "epoch": 0.8995, + "grad_norm": 5.618851184844971, + "kl": 27.59375, + "learning_rate": 6.150697724044407e-07, + "loss": 2.2377, + "num_tokens": 50881188.0, + "reward": 1.48046875, + "reward_std": 0.8542172014713287, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.40513405948877335, + "step": 1799, + "token_counts/after_target": 785.25, + "token_counts/after_think": 8.75, + "token_counts/before_target": 2007.5, + "token_counts/before_think": 803.25 + }, + { + "avg_penalty/after_target": 2.4678146839141846, + "avg_penalty/after_think": 3.835375964641571, + "avg_penalty/before_target": 0.38900747895240784, + "avg_penalty/before_think": 0.3004601299762726, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 540.75, + "completions/max_terminated_length": 498.25, + "completions/mean_length": 192.375, + "completions/mean_terminated_length": 180.65312957763672, + "completions/min_length": 48.25, + "completions/min_terminated_length": 48.25, + "epoch": 0.9, + "grad_norm": 10.571950912475586, + "kl": 25.125, + "learning_rate": 6.090574790529091e-07, + "loss": 1.8299, + "num_tokens": 50905564.0, + "reward": 1.49609375, + "reward_std": 0.8607681393623352, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44938503205776215, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.41771669685840607, + "step": 1800, + "token_counts/after_target": 427.5, + "token_counts/after_think": 21.5, + "token_counts/before_target": 1967.25, + "token_counts/before_think": 661.75 + }, + { + "avg_penalty/after_target": 2.361494928598404, + "avg_penalty/after_think": 3.10040619969368, + "avg_penalty/before_target": 0.334563422948122, + "avg_penalty/before_think": 0.5978033915162086, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 713.0, + "completions/max_terminated_length": 612.5, + "completions/mean_length": 252.90625, + "completions/mean_terminated_length": 240.4718780517578, + "completions/min_length": 44.5, + "completions/min_terminated_length": 44.5, + "epoch": 0.9005, + "grad_norm": 7.811835289001465, + "kl": 22.65625, + "learning_rate": 6.030737921409169e-07, + "loss": 1.706, + "num_tokens": 50930806.0, + "reward": 1.40625, + "reward_std": 0.844340980052948, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.48935678601264954, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.39574137330055237, + "step": 1801, + "token_counts/after_target": 637.5, + "token_counts/after_think": 64.25, + "token_counts/before_target": 2405.0, + "token_counts/before_think": 939.75 + }, + { + "avg_penalty/after_target": 2.9620308876037598, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.33452652394771576, + "avg_penalty/before_think": 0.38633936271071434, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 170.140625, + "completions/mean_terminated_length": 170.140625, + "completions/min_length": 48.75, + "completions/min_terminated_length": 48.75, + "epoch": 0.901, + "grad_norm": 4.733696937561035, + "kl": 21.140625, + "learning_rate": 5.971187298958103e-07, + "loss": 1.9126, + "num_tokens": 50953967.0, + "reward": 1.63671875, + "reward_std": 0.7486619353294373, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.3536282479763031, + "step": 1802, + "token_counts/after_target": 482.0, + "token_counts/after_think": 18.75, + "token_counts/before_target": 1364.75, + "token_counts/before_think": 856.75 + }, + { + "avg_penalty/after_target": 2.6326156854629517, + "avg_penalty/after_think": 3.7304734587669373, + "avg_penalty/before_target": 0.4670513868331909, + "avg_penalty/before_think": 0.5014496445655823, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.25, + "completions/max_terminated_length": 682.25, + "completions/mean_length": 174.640625, + "completions/mean_terminated_length": 174.640625, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.9015, + "grad_norm": 4.7969441413879395, + "kl": 24.09375, + "learning_rate": 5.911923104577455e-07, + "loss": 2.0722, + "num_tokens": 50976296.0, + "reward": 1.51171875, + "reward_std": 0.8201775550842285, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44442643970251083, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.39707323908805847, + "step": 1803, + "token_counts/after_target": 380.25, + "token_counts/after_think": 86.5, + "token_counts/before_target": 1551.0, + "token_counts/before_think": 776.5 + }, + { + "avg_penalty/after_target": 2.287112683057785, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.33962811529636383, + "avg_penalty/before_think": 0.3807784169912338, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 185.640625, + "completions/mean_terminated_length": 185.640625, + "completions/min_length": 53.25, + "completions/min_terminated_length": 53.25, + "epoch": 0.902, + "grad_norm": 2.8160006999969482, + "kl": 18.640625, + "learning_rate": 5.852945518796205e-07, + "loss": 1.5973, + "num_tokens": 50995121.0, + "reward": 1.5390625, + "reward_std": 0.7901882976293564, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44938503205776215, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.36630847305059433, + "step": 1804, + "token_counts/after_target": 297.5, + "token_counts/after_think": 166.5, + "token_counts/before_target": 1737.5, + "token_counts/before_think": 768.75 + }, + { + "avg_penalty/after_target": 2.131279796361923, + "avg_penalty/after_think": 3.847874939441681, + "avg_penalty/before_target": 0.4863831251859665, + "avg_penalty/before_think": 0.39963649585843086, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.75, + "completions/max_terminated_length": 603.75, + "completions/mean_length": 197.15625, + "completions/mean_terminated_length": 197.15625, + "completions/min_length": 45.25, + "completions/min_terminated_length": 45.25, + "epoch": 0.9025, + "grad_norm": 4.156296730041504, + "kl": 21.640625, + "learning_rate": 5.794254721270331e-07, + "loss": 1.8083, + "num_tokens": 51020443.0, + "reward": 1.5859375, + "reward_std": 0.7759943306446075, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + 
"rewards/format_reward/std": 0.43303824216127396, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.3595755770802498, + "step": 1805, + "token_counts/after_target": 582.0, + "token_counts/after_think": 57.75, + "token_counts/before_target": 1529.5, + "token_counts/before_think": 985.25 + }, + { + "avg_penalty/after_target": 2.4314140677452087, + "avg_penalty/after_think": 3.6068819761276245, + "avg_penalty/before_target": 0.3024638183414936, + "avg_penalty/before_think": 0.488981656730175, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.75, + "completions/max_terminated_length": 484.75, + "completions/mean_length": 183.078125, + "completions/mean_terminated_length": 183.078125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.903, + "grad_norm": 5.9721550941467285, + "kl": 14.92578125, + "learning_rate": 5.735850890782158e-07, + "loss": 1.4673, + "num_tokens": 51046192.0, + "reward": 1.765625, + "reward_std": 0.6286230981349945, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.11180340498685837, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.35648179799318314, + "rewards/tag_count_reward/mean": 0.875, + "rewards/tag_count_reward/std": 0.2782370001077652, + "step": 1806, + "token_counts/after_target": 440.0, + "token_counts/after_think": 141.75, + "token_counts/before_target": 1246.0, + "token_counts/before_think": 1101.5 + }, + { + "avg_penalty/after_target": 2.1613783836364746, + "avg_penalty/after_think": 3.977950870990753, + "avg_penalty/before_target": 0.5502367317676544, + "avg_penalty/before_think": 0.41741780936717987, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 747.75, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 227.03125, + "completions/mean_terminated_length": 214.7145881652832, + "completions/min_length": 39.5, + "completions/min_terminated_length": 39.5, + "epoch": 0.9035, + "grad_norm": 4.359194278717041, + "kl": 25.125, + "learning_rate": 5.677734205239904e-07, + "loss": 2.1982, + "num_tokens": 51069538.0, + "reward": 1.3984375, + "reward_std": 0.8569171130657196, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.48456869274377823, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.39403538405895233, + "step": 1807, + "token_counts/after_target": 819.25, + "token_counts/after_think": 82.25, + "token_counts/before_target": 1515.5, + "token_counts/before_think": 1215.5 + }, + { + "avg_penalty/after_target": 1.876112848520279, + "avg_penalty/after_think": 3.78531676530838, + "avg_penalty/before_target": 0.48500510305166245, + "avg_penalty/before_think": 0.48462191224098206, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 650.0, + "completions/max_terminated_length": 490.75, + "completions/mean_length": 191.625, + "completions/mean_terminated_length": 178.52291870117188, + "completions/min_length": 54.25, + "completions/min_terminated_length": 54.25, + "epoch": 0.904, + "grad_norm": 8.152308464050293, + "kl": 24.125, + "learning_rate": 5.619904841677059e-07, + "loss": 1.8444, + "num_tokens": 51090714.0, + "reward": 1.5078125, + "reward_std": 0.8566398322582245, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + 
"rewards/format_reward/std": 0.45508860796689987, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.41742007434368134, + "step": 1808, + "token_counts/after_target": 464.5, + "token_counts/after_think": 76.5, + "token_counts/before_target": 1447.5, + "token_counts/before_think": 1077.5 + }, + { + "avg_penalty/after_target": 2.16938054561615, + "avg_penalty/after_think": 3.7860284447669983, + "avg_penalty/before_target": 0.379836343228817, + "avg_penalty/before_think": 0.46855173259973526, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.5, + "completions/max_terminated_length": 608.5, + "completions/mean_length": 231.0625, + "completions/mean_terminated_length": 231.0625, + "completions/min_length": 32.75, + "completions/min_terminated_length": 32.75, + "epoch": 0.9045, + "grad_norm": 4.110883712768555, + "kl": 18.984375, + "learning_rate": 5.562362976251901e-07, + "loss": 1.535, + "num_tokens": 51115438.0, + "reward": 1.5, + "reward_std": 0.8454371988773346, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44187305867671967, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.41491810977458954, + "step": 1809, + "token_counts/after_target": 619.75, + "token_counts/after_think": 130.0, + "token_counts/before_target": 1856.25, + "token_counts/before_think": 1091.0 + }, + { + "avg_penalty/after_target": 2.1322184801101685, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.46762315928936005, + "avg_penalty/before_think": 0.4404817521572113, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 
0.03125, + "completions/max_length": 849.0, + "completions/max_terminated_length": 598.5, + "completions/mean_length": 230.203125, + "completions/mean_terminated_length": 204.27396392822266, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.905, + "grad_norm": 9.0643892288208, + "kl": 28.65625, + "learning_rate": 5.505108784246926e-07, + "loss": 2.1441, + "num_tokens": 51142859.0, + "reward": 1.484375, + "reward_std": 0.8546489626169205, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44187305867671967, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.42263223975896835, + "step": 1810, + "token_counts/after_target": 658.5, + "token_counts/after_think": 108.5, + "token_counts/before_target": 2091.0, + "token_counts/before_think": 825.25 + }, + { + "avg_penalty/after_target": 2.9106993079185486, + "avg_penalty/after_think": 3.774173617362976, + "avg_penalty/before_target": 0.3882523514330387, + "avg_penalty/before_think": 0.37395672500133514, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.0, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 188.8125, + "completions/mean_terminated_length": 188.8125, + "completions/min_length": 42.25, + "completions/min_terminated_length": 42.25, + "epoch": 0.9055, + "grad_norm": 5.593165874481201, + "kl": 19.890625, + "learning_rate": 5.448142440068316e-07, + "loss": 1.7569, + "num_tokens": 51164495.0, + "reward": 1.65625, + "reward_std": 0.7508548945188522, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.40311288833618164, + "rewards/tag_count_reward/mean": 0.84375, 
+ "rewards/tag_count_reward/std": 0.3622080758213997, + "step": 1811, + "token_counts/after_target": 417.5, + "token_counts/after_think": 27.0, + "token_counts/before_target": 1653.5, + "token_counts/before_think": 923.0 + }, + { + "avg_penalty/after_target": 2.3843441903591156, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.5266795977950096, + "avg_penalty/before_think": 0.3331136256456375, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 691.0, + "completions/max_terminated_length": 691.0, + "completions/mean_length": 242.96875, + "completions/mean_terminated_length": 242.96875, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.906, + "grad_norm": 5.020679950714111, + "kl": 28.09375, + "learning_rate": 5.391464117245471e-07, + "loss": 2.2551, + "num_tokens": 51192637.0, + "reward": 1.4375, + "reward_std": 0.9088309109210968, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4761601909995079, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.428238607943058, + "step": 1812, + "token_counts/after_target": 937.75, + "token_counts/after_think": 113.25, + "token_counts/before_target": 1828.25, + "token_counts/before_think": 1008.25 + }, + { + "avg_penalty/after_target": 2.156361848115921, + "avg_penalty/after_think": 3.6172162294387817, + "avg_penalty/before_target": 0.3620261922478676, + "avg_penalty/before_think": 0.5417445376515388, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.25, + "completions/max_terminated_length": 628.25, + 
"completions/mean_length": 229.1875, + "completions/mean_terminated_length": 229.1875, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.9065, + "grad_norm": 5.467823028564453, + "kl": 14.390625, + "learning_rate": 5.335073988430373e-07, + "loss": 1.4008, + "num_tokens": 51219865.0, + "reward": 1.6953125, + "reward_std": 0.6520241349935532, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + "rewards/tag_count_reward/mean": 0.8828125, + "rewards/tag_count_reward/std": 0.25280462577939034, + "step": 1813, + "token_counts/after_target": 363.75, + "token_counts/after_think": 308.0, + "token_counts/before_target": 1928.25, + "token_counts/before_think": 1067.0 + }, + { + "avg_penalty/after_target": 2.4627579748630524, + "avg_penalty/after_think": 3.2987639904022217, + "avg_penalty/before_target": 0.42585518956184387, + "avg_penalty/before_think": 0.4185573235154152, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 204.78125, + "completions/mean_terminated_length": 204.78125, + "completions/min_length": 40.75, + "completions/min_terminated_length": 40.75, + "epoch": 0.907, + "grad_norm": 4.974770545959473, + "kl": 25.015625, + "learning_rate": 5.278972225397128e-07, + "loss": 2.205, + "num_tokens": 51241947.0, + "reward": 1.58984375, + "reward_std": 0.7301896959543228, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4000816270709038, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.33299384266138077, + "step": 1814, + 
"token_counts/after_target": 689.25, + "token_counts/after_think": 28.25, + "token_counts/before_target": 1760.0, + "token_counts/before_think": 799.0 + }, + { + "avg_penalty/after_target": 2.5073155760765076, + "avg_penalty/after_think": 2.7098355293273926, + "avg_penalty/before_target": 0.31718968227505684, + "avg_penalty/before_think": 0.54825060069561, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.25, + "completions/max_terminated_length": 586.25, + "completions/mean_length": 237.234375, + "completions/mean_terminated_length": 237.234375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.9075, + "grad_norm": 9.229466438293457, + "kl": 25.0, + "learning_rate": 5.223158999041444e-07, + "loss": 1.8855, + "num_tokens": 51266442.0, + "reward": 1.41796875, + "reward_std": 0.8805011063814163, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.4840351790189743, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.42255035042762756, + "step": 1815, + "token_counts/after_target": 665.0, + "token_counts/after_think": 46.25, + "token_counts/before_target": 2033.5, + "token_counts/before_think": 1051.0 + }, + { + "avg_penalty/after_target": 2.774721533060074, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.40668369829654694, + "avg_penalty/before_think": 0.41211478412151337, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.5, + "completions/max_terminated_length": 744.5, + "completions/mean_length": 241.3125, + 
"completions/mean_terminated_length": 241.3125, + "completions/min_length": 40.25, + "completions/min_terminated_length": 40.25, + "epoch": 0.908, + "grad_norm": 7.037314414978027, + "kl": 24.0, + "learning_rate": 5.167634479380068e-07, + "loss": 1.8536, + "num_tokens": 51294830.0, + "reward": 1.52734375, + "reward_std": 0.8245432376861572, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.3778631240129471, + "step": 1816, + "token_counts/after_target": 698.25, + "token_counts/after_think": 10.0, + "token_counts/before_target": 2015.0, + "token_counts/before_think": 1137.75 + }, + { + "avg_penalty/after_target": 1.7452600002288818, + "avg_penalty/after_think": 3.801143169403076, + "avg_penalty/before_target": 0.5083757862448692, + "avg_penalty/before_think": 0.6468274742364883, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.5, + "completions/max_terminated_length": 609.5, + "completions/mean_length": 232.421875, + "completions/mean_terminated_length": 232.421875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.9085, + "grad_norm": 4.0372724533081055, + "kl": 16.84375, + "learning_rate": 5.112398835550348e-07, + "loss": 1.5487, + "num_tokens": 51321417.0, + "reward": 1.5859375, + "reward_std": 0.7270862609148026, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4308478757739067, + "rewards/tag_count_reward/mean": 0.8359375, + "rewards/tag_count_reward/std": 0.3266169838607311, + "step": 1817, + "token_counts/after_target": 783.25, + 
"token_counts/after_think": 152.25, + "token_counts/before_target": 1268.75, + "token_counts/before_think": 1514.5 + }, + { + "avg_penalty/after_target": 1.9135377407073975, + "avg_penalty/after_think": 2.688149571418762, + "avg_penalty/before_target": 0.37219860032200813, + "avg_penalty/before_think": 0.598905511200428, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.25, + "completions/max_terminated_length": 651.25, + "completions/mean_length": 230.484375, + "completions/mean_terminated_length": 230.484375, + "completions/min_length": 32.25, + "completions/min_terminated_length": 32.25, + "epoch": 0.909, + "grad_norm": 3.049360513687134, + "kl": 14.26953125, + "learning_rate": 5.057452235809623e-07, + "loss": 1.2282, + "num_tokens": 51345464.0, + "reward": 1.56640625, + "reward_std": 0.6392254531383514, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4040650501847267, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.26381272077560425, + "step": 1818, + "token_counts/after_target": 601.5, + "token_counts/after_think": 106.25, + "token_counts/before_target": 1697.75, + "token_counts/before_think": 1282.25 + }, + { + "avg_penalty/after_target": 1.8610369563102722, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.5491693653166294, + "avg_penalty/before_think": 0.62847750633955, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 864.0, + "completions/max_terminated_length": 746.0, + "completions/mean_length": 298.046875, + "completions/mean_terminated_length": 
286.58021545410156, + "completions/min_length": 63.5, + "completions/min_terminated_length": 63.5, + "epoch": 0.9095, + "grad_norm": 4.44058895111084, + "kl": 25.90625, + "learning_rate": 5.002794847534765e-07, + "loss": 2.127, + "num_tokens": 51378219.0, + "reward": 1.51171875, + "reward_std": 0.9595192223787308, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.11967839300632477, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.46034691482782364, + "rewards/tag_count_reward/mean": 0.73046875, + "rewards/tag_count_reward/std": 0.4330151230096817, + "step": 1819, + "token_counts/after_target": 1016.0, + "token_counts/after_think": 320.5, + "token_counts/before_target": 2493.75, + "token_counts/before_think": 938.5 + }, + { + "avg_penalty/after_target": 2.609332501888275, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4557825028896332, + "avg_penalty/before_think": 0.35258834064006805, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.75, + "completions/max_terminated_length": 543.75, + "completions/mean_length": 212.421875, + "completions/mean_terminated_length": 212.421875, + "completions/min_length": 51.75, + "completions/min_terminated_length": 51.75, + "epoch": 0.91, + "grad_norm": 5.163636207580566, + "kl": 17.921875, + "learning_rate": 4.948426837221632e-07, + "loss": 1.4569, + "num_tokens": 51401942.0, + "reward": 1.625, + "reward_std": 0.7576601803302765, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4022643193602562, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.3649432808160782, + "step": 1820, + "token_counts/after_target": 477.75, + "token_counts/after_think": 33.25, + 
"token_counts/before_target": 1386.0, + "token_counts/before_think": 1501.75 + }, + { + "avg_penalty/after_target": 2.788352310657501, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.31701405346393585, + "avg_penalty/before_think": 0.6234227269887924, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 219.359375, + "completions/mean_terminated_length": 219.359375, + "completions/min_length": 43.25, + "completions/min_terminated_length": 43.25, + "epoch": 0.9105, + "grad_norm": 5.347748756408691, + "kl": 24.84375, + "learning_rate": 4.894348370484648e-07, + "loss": 1.9735, + "num_tokens": 51424925.0, + "reward": 1.421875, + "reward_std": 0.8853699713945389, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4704566150903702, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.43427834659814835, + "step": 1821, + "token_counts/after_target": 572.5, + "token_counts/after_think": 37.75, + "token_counts/before_target": 1984.25, + "token_counts/before_think": 915.25 + }, + { + "avg_penalty/after_target": 2.9851279258728027, + "avg_penalty/after_think": 1.5269754528999329, + "avg_penalty/before_target": 0.30608297511935234, + "avg_penalty/before_think": 0.36668749153614044, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.5, + "completions/max_terminated_length": 454.5, + "completions/mean_length": 145.8125, + "completions/mean_terminated_length": 145.8125, + "completions/min_length": 43.0, + 
"completions/min_terminated_length": 43.0, + "epoch": 0.911, + "grad_norm": 7.766767501831055, + "kl": 16.75, + "learning_rate": 4.840559612056184e-07, + "loss": 1.6517, + "num_tokens": 51445841.0, + "reward": 1.6953125, + "reward_std": 0.7265808284282684, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38772592693567276, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.32992538437247276, + "step": 1822, + "token_counts/after_target": 396.75, + "token_counts/after_think": 9.75, + "token_counts/before_target": 1148.0, + "token_counts/before_think": 778.5 + }, + { + "avg_penalty/after_target": 2.6939941346645355, + "avg_penalty/after_think": 3.816153585910797, + "avg_penalty/before_target": 0.3569910079240799, + "avg_penalty/before_think": 0.466617226600647, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.25, + "completions/max_terminated_length": 482.25, + "completions/mean_length": 184.78125, + "completions/mean_terminated_length": 184.78125, + "completions/min_length": 36.75, + "completions/min_terminated_length": 36.75, + "epoch": 0.9115, + "grad_norm": 3.8914599418640137, + "kl": 17.71875, + "learning_rate": 4.787060725786141e-07, + "loss": 1.6577, + "num_tokens": 51465443.0, + "reward": 1.5859375, + "reward_std": 0.778015211224556, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42867646366357803, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.36634697765111923, + "step": 1823, + "token_counts/after_target": 397.0, + "token_counts/after_think": 152.75, + "token_counts/before_target": 1389.75, + 
"token_counts/before_think": 1017.0 + }, + { + "avg_penalty/after_target": 1.9014167785644531, + "avg_penalty/after_think": 2.7722902297973633, + "avg_penalty/before_target": 0.6264310926198959, + "avg_penalty/before_think": 0.5603676065802574, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.25, + "completions/max_terminated_length": 686.25, + "completions/mean_length": 275.53125, + "completions/mean_terminated_length": 275.53125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.912, + "grad_norm": 3.294097423553467, + "kl": 18.734375, + "learning_rate": 4.733851874641382e-07, + "loss": 1.6979, + "num_tokens": 51495509.0, + "reward": 1.55078125, + "reward_std": 0.7727174162864685, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44938503205776215, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.3494512289762497, + "step": 1824, + "token_counts/after_target": 1127.0, + "token_counts/after_think": 12.0, + "token_counts/before_target": 1698.0, + "token_counts/before_think": 1571.5 + }, + { + "avg_penalty/after_target": 2.236967921257019, + "avg_penalty/after_think": 2.234582543373108, + "avg_penalty/before_target": 0.43098704516887665, + "avg_penalty/before_think": 0.6375631913542747, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.25, + "completions/max_terminated_length": 637.25, + "completions/mean_length": 221.796875, + "completions/mean_terminated_length": 221.796875, + "completions/min_length": 35.5, + "completions/min_terminated_length": 35.5, 
+ "epoch": 0.9125, + "grad_norm": 3.3504152297973633, + "kl": 22.45703125, + "learning_rate": 4.6809332207053083e-07, + "loss": 1.8926, + "num_tokens": 51518744.0, + "reward": 1.4921875, + "reward_std": 0.6671957820653915, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.36797719448804855, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.311381459236145, + "step": 1825, + "token_counts/after_target": 719.0, + "token_counts/after_think": 63.25, + "token_counts/before_target": 1777.0, + "token_counts/before_think": 989.5 + }, + { + "avg_penalty/after_target": 2.4939818382263184, + "avg_penalty/after_think": 3.680673658847809, + "avg_penalty/before_target": 0.30661285668611526, + "avg_penalty/before_think": 0.5025744661688805, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.5, + "completions/max_terminated_length": 565.5, + "completions/mean_length": 210.625, + "completions/mean_terminated_length": 210.625, + "completions/min_length": 43.25, + "completions/min_terminated_length": 43.25, + "epoch": 0.913, + "grad_norm": 3.8185555934906006, + "kl": 20.4375, + "learning_rate": 4.628304925177318e-07, + "loss": 1.7444, + "num_tokens": 51543520.0, + "reward": 1.62109375, + "reward_std": 0.9191349297761917, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.1280868947505951, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4519384130835533, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.39552831649780273, + "step": 1826, + "token_counts/after_target": 389.0, + "token_counts/after_think": 168.5, + "token_counts/before_target": 1856.25, + "token_counts/before_think": 956.25 + }, + { + 
"avg_penalty/after_target": 2.1986903846263885, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4996580630540848, + "avg_penalty/before_think": 0.522061362862587, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 824.25, + "completions/max_terminated_length": 621.5, + "completions/mean_length": 229.46875, + "completions/mean_terminated_length": 201.99687957763672, + "completions/min_length": 41.5, + "completions/min_terminated_length": 41.5, + "epoch": 0.9135, + "grad_norm": 8.565945625305176, + "kl": 29.53125, + "learning_rate": 4.575967148372318e-07, + "loss": 2.3087, + "num_tokens": 51567966.0, + "reward": 1.4453125, + "reward_std": 0.8734038174152374, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4402689263224602, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.42167239636182785, + "step": 1827, + "token_counts/after_target": 739.0, + "token_counts/after_think": 98.0, + "token_counts/before_target": 2034.0, + "token_counts/before_think": 800.5 + }, + { + "avg_penalty/after_target": 2.758070945739746, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3222798481583595, + "avg_penalty/before_think": 0.5494344606995583, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 585.75, + "completions/max_terminated_length": 562.75, + "completions/mean_length": 195.21875, + "completions/mean_terminated_length": 183.9093780517578, + "completions/min_length": 37.75, + "completions/min_terminated_length": 37.75, + "epoch": 0.914, + "grad_norm": 
4.8578948974609375, + "kl": 16.546875, + "learning_rate": 4.5239200497202654e-07, + "loss": 1.3777, + "num_tokens": 51592012.0, + "reward": 1.59765625, + "reward_std": 0.7333121299743652, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.3921433389186859, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.3621448576450348, + "step": 1828, + "token_counts/after_target": 563.75, + "token_counts/after_think": 88.75, + "token_counts/before_target": 1702.0, + "token_counts/before_think": 769.0 + }, + { + "avg_penalty/after_target": 2.390801727771759, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.368485651910305, + "avg_penalty/before_think": 0.3050914704799652, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.25, + "completions/max_terminated_length": 587.25, + "completions/mean_length": 180.359375, + "completions/mean_terminated_length": 180.359375, + "completions/min_length": 31.75, + "completions/min_terminated_length": 31.75, + "epoch": 0.9145, + "grad_norm": 4.399402618408203, + "kl": 20.46875, + "learning_rate": 4.4721637877656377e-07, + "loss": 1.8076, + "num_tokens": 51613235.0, + "reward": 1.59375, + "reward_std": 0.7325100749731064, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.41110680997371674, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.34366922080516815, + "step": 1829, + "token_counts/after_target": 511.25, + "token_counts/after_think": 126.75, + "token_counts/before_target": 1229.0, + "token_counts/before_think": 1018.75 + }, + { + "avg_penalty/after_target": 2.532071828842163, + 
"avg_penalty/after_think": 3.5867373943328857, + "avg_penalty/before_target": 0.4934747517108917, + "avg_penalty/before_think": 0.7448433116078377, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 776.5, + "completions/max_terminated_length": 743.75, + "completions/mean_length": 287.625, + "completions/mean_terminated_length": 275.7875061035156, + "completions/min_length": 44.5, + "completions/min_terminated_length": 44.5, + "epoch": 0.915, + "grad_norm": 5.1749958992004395, + "kl": 26.03125, + "learning_rate": 4.420698520166988e-07, + "loss": 2.1261, + "num_tokens": 51643563.0, + "reward": 1.47265625, + "reward_std": 0.7914433181285858, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4613594636321068, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.3817756846547127, + "step": 1830, + "token_counts/after_target": 1069.25, + "token_counts/after_think": 31.0, + "token_counts/before_target": 2407.25, + "token_counts/before_think": 1094.5 + }, + { + "avg_penalty/after_target": 2.2895217537879944, + "avg_penalty/after_think": 3.671001136302948, + "avg_penalty/before_target": 0.4631265327334404, + "avg_penalty/before_think": 0.4858423173427582, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.5, + "completions/max_terminated_length": 733.5, + "completions/mean_length": 232.8125, + "completions/mean_terminated_length": 232.8125, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.9155, + "grad_norm": 6.94297456741333, + "kl": 31.09375, + "learning_rate": 
4.3695244036964567e-07, + "loss": 2.428, + "num_tokens": 51671983.0, + "reward": 1.40234375, + "reward_std": 0.8905564695596695, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.48148179799318314, + "rewards/tag_count_reward/mean": 0.73046875, + "rewards/tag_count_reward/std": 0.4266953766345978, + "step": 1831, + "token_counts/after_target": 853.25, + "token_counts/after_think": 101.0, + "token_counts/before_target": 2063.5, + "token_counts/before_think": 707.25 + }, + { + "avg_penalty/after_target": 2.4901085793972015, + "avg_penalty/after_think": 3.8992040157318115, + "avg_penalty/before_target": 0.5143763944506645, + "avg_penalty/before_think": 0.5609957203269005, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.5, + "completions/max_terminated_length": 699.5, + "completions/mean_length": 256.40625, + "completions/mean_terminated_length": 256.40625, + "completions/min_length": 36.75, + "completions/min_terminated_length": 36.75, + "epoch": 0.916, + "grad_norm": 5.451164245605469, + "kl": 23.15625, + "learning_rate": 4.318641594239259e-07, + "loss": 2.1475, + "num_tokens": 51697705.0, + "reward": 1.5546875, + "reward_std": 0.7738254964351654, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4383598491549492, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.351142268627882, + "step": 1832, + "token_counts/after_target": 1044.0, + "token_counts/after_think": 45.25, + "token_counts/before_target": 1671.0, + "token_counts/before_think": 1342.25 + }, + { + "avg_penalty/after_target": 1.667093575000763, + "avg_penalty/after_think": 3.210534691810608, + 
"avg_penalty/before_target": 0.3794799745082855, + "avg_penalty/before_think": 0.38543038070201874, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 173.53125, + "completions/mean_terminated_length": 173.53125, + "completions/min_length": 38.75, + "completions/min_terminated_length": 38.75, + "epoch": 0.9165, + "grad_norm": 7.676076889038086, + "kl": 17.5078125, + "learning_rate": 4.268050246793276e-07, + "loss": 1.29, + "num_tokens": 51717563.0, + "reward": 1.5625, + "reward_std": 0.8115668445825577, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4229728877544403, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.40106870234012604, + "step": 1833, + "token_counts/after_target": 346.0, + "token_counts/after_think": 65.0, + "token_counts/before_target": 1390.0, + "token_counts/before_think": 975.5 + }, + { + "avg_penalty/after_target": 1.5486309826374054, + "avg_penalty/after_think": 3.3651577532291412, + "avg_penalty/before_target": 0.31124408915638924, + "avg_penalty/before_think": 0.5213892012834549, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.75, + "completions/max_terminated_length": 546.75, + "completions/mean_length": 183.796875, + "completions/mean_terminated_length": 183.796875, + "completions/min_length": 41.5, + "completions/min_terminated_length": 41.5, + "epoch": 0.917, + "grad_norm": 7.5725812911987305, + "kl": 19.75, + "learning_rate": 4.2177505154685215e-07, + "loss": 1.4809, + "num_tokens": 
51738398.0, + "reward": 1.40625, + "reward_std": 0.8727664351463318, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.47354350984096527, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.4177818149328232, + "step": 1834, + "token_counts/after_target": 360.75, + "token_counts/after_think": 58.5, + "token_counts/before_target": 1510.25, + "token_counts/before_think": 1011.25 + }, + { + "avg_penalty/after_target": 2.8985812067985535, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.5238592699170113, + "avg_penalty/before_think": 0.5178559198975563, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 689.75, + "completions/max_terminated_length": 689.75, + "completions/mean_length": 242.40625, + "completions/mean_terminated_length": 242.40625, + "completions/min_length": 52.75, + "completions/min_terminated_length": 52.75, + "epoch": 0.9175, + "grad_norm": 6.006483554840088, + "kl": 22.3125, + "learning_rate": 4.167742553486676e-07, + "loss": 2.0819, + "num_tokens": 51763288.0, + "reward": 1.53125, + "reward_std": 0.7519092857837677, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.43494731932878494, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.35645730793476105, + "step": 1835, + "token_counts/after_target": 905.0, + "token_counts/after_think": 48.5, + "token_counts/before_target": 1843.0, + "token_counts/before_think": 1082.0 + }, + { + "avg_penalty/after_target": 3.0576798915863037, + "avg_penalty/after_think": 3.8882389664649963, + "avg_penalty/before_target": 0.3984721153974533, + "avg_penalty/before_think": 
0.5831617414951324, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 735.25, + "completions/max_terminated_length": 735.25, + "completions/mean_length": 229.859375, + "completions/mean_terminated_length": 229.859375, + "completions/min_length": 38.5, + "completions/min_terminated_length": 38.5, + "epoch": 0.918, + "grad_norm": 3.392948627471924, + "kl": 24.09375, + "learning_rate": 4.118026513180695e-07, + "loss": 2.1266, + "num_tokens": 51789343.0, + "reward": 1.4921875, + "reward_std": 0.7685577571392059, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.46039126068353653, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3541737385094166, + "step": 1836, + "token_counts/after_target": 593.25, + "token_counts/after_think": 197.5, + "token_counts/before_target": 2029.75, + "token_counts/before_think": 857.25 + }, + { + "avg_penalty/after_target": 2.2805094122886658, + "avg_penalty/after_think": 1.773576706647873, + "avg_penalty/before_target": 0.4572560489177704, + "avg_penalty/before_think": 0.5880823656916618, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.75, + "completions/max_terminated_length": 757.75, + "completions/mean_length": 236.171875, + "completions/mean_terminated_length": 236.171875, + "completions/min_length": 46.25, + "completions/min_terminated_length": 46.25, + "epoch": 0.9185, + "grad_norm": 3.855229616165161, + "kl": 20.5625, + "learning_rate": 4.068602545994249e-07, + "loss": 1.7656, + "num_tokens": 51813034.0, + "reward": 1.578125, + "reward_std": 0.7636384069919586, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.43303824216127396, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.3526933118700981, + "step": 1837, + "token_counts/after_target": 658.5, + "token_counts/after_think": 18.0, + "token_counts/before_target": 2068.0, + "token_counts/before_think": 1034.25 + }, + { + "avg_penalty/after_target": 2.3257138431072235, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3868021070957184, + "avg_penalty/before_think": 0.530545637011528, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.25, + "completions/max_terminated_length": 609.25, + "completions/mean_length": 200.34375, + "completions/mean_terminated_length": 200.34375, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.919, + "grad_norm": 4.880683422088623, + "kl": 20.9375, + "learning_rate": 4.019470802481307e-07, + "loss": 1.6728, + "num_tokens": 51836528.0, + "reward": 1.5, + "reward_std": 0.8316047340631485, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.3967379406094551, + "step": 1838, + "token_counts/after_target": 337.25, + "token_counts/after_think": 112.25, + "token_counts/before_target": 1581.25, + "token_counts/before_think": 1174.75 + }, + { + "avg_penalty/after_target": 2.7309950590133667, + "avg_penalty/after_think": 2.8699758052825928, + "avg_penalty/before_target": 0.5225117057561874, + "avg_penalty/before_think": 0.44674959778785706, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.25, + "completions/max_terminated_length": 739.25, + "completions/mean_length": 251.09375, + "completions/mean_terminated_length": 251.09375, + "completions/min_length": 57.75, + "completions/min_terminated_length": 57.75, + "epoch": 0.9195, + "grad_norm": 10.648810386657715, + "kl": 22.421875, + "learning_rate": 3.9706314323056936e-07, + "loss": 2.2623, + "num_tokens": 51863094.0, + "reward": 1.52734375, + "reward_std": 0.7818265706300735, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4079566150903702, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.38342586159706116, + "step": 1839, + "token_counts/after_target": 1119.5, + "token_counts/after_think": 81.75, + "token_counts/before_target": 1718.5, + "token_counts/before_think": 1097.75 + }, + { + "avg_penalty/after_target": 2.1595409512519836, + "avg_penalty/after_think": 3.9326654076576233, + "avg_penalty/before_target": 0.4151322543621063, + "avg_penalty/before_think": 0.7062039896845818, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.25, + "completions/max_terminated_length": 646.25, + "completions/mean_length": 250.3125, + "completions/mean_terminated_length": 250.3125, + "completions/min_length": 43.75, + "completions/min_terminated_length": 43.75, + "epoch": 0.92, + "grad_norm": 2.85007381439209, + "kl": 17.28125, + "learning_rate": 3.922084584240582e-07, + "loss": 1.5248, + "num_tokens": 51888586.0, + "reward": 1.453125, + "reward_std": 0.807273656129837, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + 
"rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.44187305867671967, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.3691457137465477, + "step": 1840, + "token_counts/after_target": 939.5, + "token_counts/after_think": 97.0, + "token_counts/before_target": 1694.5, + "token_counts/before_think": 1274.0 + }, + { + "avg_penalty/after_target": 2.6247373521327972, + "avg_penalty/after_think": 3.56581848859787, + "avg_penalty/before_target": 0.333002720028162, + "avg_penalty/before_think": 0.46846508979797363, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.25, + "completions/max_terminated_length": 450.25, + "completions/mean_length": 204.9375, + "completions/mean_terminated_length": 204.9375, + "completions/min_length": 40.75, + "completions/min_terminated_length": 40.75, + "epoch": 0.9205, + "grad_norm": 4.8437981605529785, + "kl": 22.9375, + "learning_rate": 3.8738304061681107e-07, + "loss": 1.9477, + "num_tokens": 51913654.0, + "reward": 1.53125, + "reward_std": 0.8206578195095062, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.44974804669618607, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.38050032407045364, + "step": 1841, + "token_counts/after_target": 529.75, + "token_counts/after_think": 121.75, + "token_counts/before_target": 1691.25, + "token_counts/before_think": 936.25 + }, + { + "avg_penalty/after_target": 2.256964772939682, + "avg_penalty/after_think": 3.821433365345001, + "avg_penalty/before_target": 0.34391339123249054, + "avg_penalty/before_think": 0.39818189293146133, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 
0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 171.8125, + "completions/mean_terminated_length": 171.8125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.921, + "grad_norm": 3.6776156425476074, + "kl": 13.7421875, + "learning_rate": 3.825869045078867e-07, + "loss": 1.3139, + "num_tokens": 51935194.0, + "reward": 1.65234375, + "reward_std": 0.6638054400682449, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.38724804669618607, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.3010594919323921, + "step": 1842, + "token_counts/after_target": 266.25, + "token_counts/after_think": 88.25, + "token_counts/before_target": 1257.75, + "token_counts/before_think": 1136.75 + }, + { + "avg_penalty/after_target": 2.6600066423416138, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.4415983781218529, + "avg_penalty/before_think": 0.45323698222637177, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.75, + "completions/max_terminated_length": 641.75, + "completions/mean_length": 243.546875, + "completions/mean_terminated_length": 243.546875, + "completions/min_length": 37.5, + "completions/min_terminated_length": 37.5, + "epoch": 0.9215, + "grad_norm": 4.293806076049805, + "kl": 18.859375, + "learning_rate": 3.7782006470714614e-07, + "loss": 1.5219, + "num_tokens": 51960429.0, + "reward": 1.43359375, + "reward_std": 0.8501105606555939, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 
0.4625816270709038, + "rewards/tag_count_reward/mean": 0.73046875, + "rewards/tag_count_reward/std": 0.4242735505104065, + "step": 1843, + "token_counts/after_target": 821.25, + "token_counts/after_think": 16.75, + "token_counts/before_target": 2071.75, + "token_counts/before_think": 987.0 + }, + { + "avg_penalty/after_target": 2.129592180252075, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.482023723423481, + "avg_penalty/before_think": 0.46152932196855545, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 742.25, + "completions/max_terminated_length": 613.75, + "completions/mean_length": 264.96875, + "completions/mean_terminated_length": 251.81771087646484, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.922, + "grad_norm": 2.822509765625, + "kl": 21.96875, + "learning_rate": 3.7308253573521193e-07, + "loss": 1.9303, + "num_tokens": 51987051.0, + "reward": 1.58984375, + "reward_std": 0.7640510201454163, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42867646366357803, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.35885218530893326, + "step": 1844, + "token_counts/after_target": 742.5, + "token_counts/after_think": 78.25, + "token_counts/before_target": 2312.5, + "token_counts/before_think": 1106.25 + }, + { + "avg_penalty/after_target": 3.007605791091919, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3612334877252579, + "avg_penalty/before_think": 0.4173302613198757, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + 
"completions/max_length": 752.5, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 242.53125, + "completions/mean_terminated_length": 230.14479446411133, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.9225, + "grad_norm": 4.755565166473389, + "kl": 25.71875, + "learning_rate": 3.68374332023419e-07, + "loss": 2.2668, + "num_tokens": 52012909.0, + "reward": 1.4921875, + "reward_std": 0.823324903845787, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4682852029800415, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.38085123151540756, + "step": 1845, + "token_counts/after_target": 728.75, + "token_counts/after_think": 134.75, + "token_counts/before_target": 2135.75, + "token_counts/before_think": 881.25 + }, + { + "avg_penalty/after_target": 2.555183708667755, + "avg_penalty/after_think": 2.5569257736206055, + "avg_penalty/before_target": 0.27170972526073456, + "avg_penalty/before_think": 0.3454698361456394, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.75, + "completions/max_terminated_length": 504.75, + "completions/mean_length": 171.15625, + "completions/mean_terminated_length": 171.15625, + "completions/min_length": 42.25, + "completions/min_terminated_length": 42.25, + "epoch": 0.923, + "grad_norm": 4.508715629577637, + "kl": 18.15625, + "learning_rate": 3.6369546791377054e-07, + "loss": 1.4816, + "num_tokens": 52036615.0, + "reward": 1.5859375, + "reward_std": 0.7756961584091187, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4057852029800415, + "rewards/tag_count_reward/mean": 
0.8046875, + "rewards/tag_count_reward/std": 0.37646619975566864, + "step": 1846, + "token_counts/after_target": 275.25, + "token_counts/after_think": 86.25, + "token_counts/before_target": 1371.0, + "token_counts/before_think": 1006.0 + }, + { + "avg_penalty/after_target": 3.2102335691452026, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.2925017811357975, + "avg_penalty/before_think": 0.44293009489774704, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 196.5, + "completions/mean_terminated_length": 196.5, + "completions/min_length": 53.75, + "completions/min_terminated_length": 53.75, + "epoch": 0.9235, + "grad_norm": 3.2587411403656006, + "kl": 17.578125, + "learning_rate": 3.590459576589e-07, + "loss": 1.5688, + "num_tokens": 52058183.0, + "reward": 1.46484375, + "reward_std": 0.831116572022438, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4546433389186859, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.3910333886742592, + "step": 1847, + "token_counts/after_target": 579.25, + "token_counts/after_think": 18.25, + "token_counts/before_target": 1431.0, + "token_counts/before_think": 1115.5 + }, + { + "avg_penalty/after_target": 2.133036494255066, + "avg_penalty/after_think": 3.9470723271369934, + "avg_penalty/before_target": 0.42400945723056793, + "avg_penalty/before_think": 0.4777248576283455, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 
745.0, + "completions/mean_length": 261.4375, + "completions/mean_terminated_length": 261.4375, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.924, + "grad_norm": 5.210468769073486, + "kl": 17.796875, + "learning_rate": 3.544258154220193e-07, + "loss": 1.7174, + "num_tokens": 52083939.0, + "reward": 1.69140625, + "reward_std": 0.6247414499521255, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.89453125, + "rewards/tag_count_reward/std": 0.2334109414368868, + "step": 1848, + "token_counts/after_target": 547.75, + "token_counts/after_think": 145.0, + "token_counts/before_target": 2457.75, + "token_counts/before_think": 1032.5 + }, + { + "avg_penalty/after_target": 2.21046844124794, + "avg_penalty/after_think": 3.7131194472312927, + "avg_penalty/before_target": 0.36223645508289337, + "avg_penalty/before_think": 0.5512705966830254, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 215.34375, + "completions/mean_terminated_length": 215.34375, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.9245, + "grad_norm": 11.347973823547363, + "kl": 14.875, + "learning_rate": 3.498350552768859e-07, + "loss": 1.693, + "num_tokens": 52107689.0, + "reward": 1.7109375, + "reward_std": 0.5725717097520828, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.30899807065725327, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.28297239542007446, + "step": 1849, + 
"token_counts/after_target": 596.75, + "token_counts/after_think": 237.0, + "token_counts/before_target": 1183.5, + "token_counts/before_think": 1428.25 + }, + { + "avg_penalty/after_target": 2.3146474361419678, + "avg_penalty/after_think": 3.4118661880493164, + "avg_penalty/before_target": 0.3876926228404045, + "avg_penalty/before_think": 0.5727883651852608, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 746.0, + "completions/max_terminated_length": 746.0, + "completions/mean_length": 235.703125, + "completions/mean_terminated_length": 235.703125, + "completions/min_length": 38.75, + "completions/min_terminated_length": 38.75, + "epoch": 0.925, + "grad_norm": 6.172056198120117, + "kl": 16.75, + "learning_rate": 3.4527369120775036e-07, + "loss": 1.6555, + "num_tokens": 52137686.0, + "reward": 1.640625, + "reward_std": 0.6912432163953781, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.3842606768012047, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.32539571076631546, + "step": 1850, + "token_counts/after_target": 854.5, + "token_counts/after_think": 107.0, + "token_counts/before_target": 1798.25, + "token_counts/before_think": 1011.5 + }, + { + "avg_penalty/after_target": 1.715819627046585, + "avg_penalty/after_think": 3.9296977519989014, + "avg_penalty/before_target": 0.3296767547726631, + "avg_penalty/before_think": 0.5118326619267464, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.5, + "completions/max_terminated_length": 555.5, + "completions/mean_length": 180.484375, + 
"completions/mean_terminated_length": 180.484375, + "completions/min_length": 40.5, + "completions/min_terminated_length": 40.5, + "epoch": 0.9255, + "grad_norm": 14.319005966186523, + "kl": 20.046875, + "learning_rate": 3.4074173710931804e-07, + "loss": 1.759, + "num_tokens": 52158437.0, + "reward": 1.609375, + "reward_std": 0.7499426305294037, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.41194770485162735, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.35015954822301865, + "step": 1851, + "token_counts/after_target": 319.25, + "token_counts/after_think": 185.0, + "token_counts/before_target": 1442.0, + "token_counts/before_think": 941.5 + }, + { + "avg_penalty/after_target": 2.292931228876114, + "avg_penalty/after_think": 2.7166327238082886, + "avg_penalty/before_target": 0.37201426550745964, + "avg_penalty/before_think": 0.6017696335911751, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 214.140625, + "completions/mean_terminated_length": 214.140625, + "completions/min_length": 36.75, + "completions/min_terminated_length": 36.75, + "epoch": 0.926, + "grad_norm": 3.227280616760254, + "kl": 12.8046875, + "learning_rate": 3.3623920678670597e-07, + "loss": 1.1847, + "num_tokens": 52183422.0, + "reward": 1.5859375, + "reward_std": 0.7150942236185074, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.404181070625782, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.346855565905571, + "step": 1852, + "token_counts/after_target": 736.0, + 
"token_counts/after_think": 32.5, + "token_counts/before_target": 1656.25, + "token_counts/before_think": 1001.5 + }, + { + "avg_penalty/after_target": 2.0456666946411133, + "avg_penalty/after_think": 2.4740200638771057, + "avg_penalty/before_target": 0.5181717872619629, + "avg_penalty/before_think": 0.47728585451841354, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 704.0, + "completions/max_terminated_length": 704.0, + "completions/mean_length": 212.625, + "completions/mean_terminated_length": 212.625, + "completions/min_length": 50.25, + "completions/min_terminated_length": 50.25, + "epoch": 0.9265, + "grad_norm": 3.8302412033081055, + "kl": 23.03125, + "learning_rate": 3.3176611395540625e-07, + "loss": 2.049, + "num_tokens": 52208358.0, + "reward": 1.546875, + "reward_std": 0.8219835311174393, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.39426781982183456, + "step": 1853, + "token_counts/after_target": 559.5, + "token_counts/after_think": 56.25, + "token_counts/before_target": 1560.5, + "token_counts/before_think": 1225.75 + }, + { + "avg_penalty/after_target": 2.115464836359024, + "avg_penalty/after_think": 3.6748024821281433, + "avg_penalty/before_target": 0.44874345511198044, + "avg_penalty/before_think": 0.40266750007867813, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 669.5, + "completions/max_terminated_length": 669.5, + "completions/mean_length": 189.765625, + "completions/mean_terminated_length": 189.765625, + 
"completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.927, + "grad_norm": 12.163640975952148, + "kl": 16.1875, + "learning_rate": 3.273224722412327e-07, + "loss": 1.817, + "num_tokens": 52230183.0, + "reward": 1.7734375, + "reward_std": 0.532820463180542, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.28694770485162735, + "rewards/tag_count_reward/mean": 0.8984375, + "rewards/tag_count_reward/std": 0.26180046796798706, + "step": 1854, + "token_counts/after_target": 583.5, + "token_counts/after_think": 87.25, + "token_counts/before_target": 1369.75, + "token_counts/before_think": 995.75 + }, + { + "avg_penalty/after_target": 1.9252059608697891, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.48016542568802834, + "avg_penalty/before_think": 0.3839472569525242, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 704.75, + "completions/max_terminated_length": 704.75, + "completions/mean_length": 225.6875, + "completions/mean_terminated_length": 225.6875, + "completions/min_length": 30.25, + "completions/min_terminated_length": 30.25, + "epoch": 0.9275, + "grad_norm": 3.0710866451263428, + "kl": 24.6875, + "learning_rate": 3.2290829518028867e-07, + "loss": 2.0554, + "num_tokens": 52255139.0, + "reward": 1.578125, + "reward_std": 0.7956727594137192, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.355011910200119, + "step": 1855, + "token_counts/after_target": 677.0, + "token_counts/after_think": 89.5, + "token_counts/before_target": 1753.25, + 
"token_counts/before_think": 1091.25 + }, + { + "avg_penalty/after_target": 1.647615373134613, + "avg_penalty/after_think": 3.8223491311073303, + "avg_penalty/before_target": 0.5725319012999535, + "avg_penalty/before_think": 0.3902988061308861, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.25, + "completions/max_terminated_length": 587.25, + "completions/mean_length": 203.328125, + "completions/mean_terminated_length": 203.328125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.928, + "grad_norm": 4.35113000869751, + "kl": 25.5, + "learning_rate": 3.185235962189237e-07, + "loss": 2.2877, + "num_tokens": 52278760.0, + "reward": 1.5078125, + "reward_std": 0.7947629541158676, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4682852029800415, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.34949661046266556, + "step": 1856, + "token_counts/after_target": 638.25, + "token_counts/after_think": 161.0, + "token_counts/before_target": 1399.0, + "token_counts/before_think": 1055.0 + }, + { + "avg_penalty/after_target": 1.6718849539756775, + "avg_penalty/after_think": 3.613288402557373, + "avg_penalty/before_target": 0.435436874628067, + "avg_penalty/before_think": 0.4437643587589264, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 666.5, + "completions/max_terminated_length": 609.75, + "completions/mean_length": 206.578125, + "completions/mean_terminated_length": 195.7062530517578, + "completions/min_length": 49.0, + "completions/min_terminated_length": 
49.0, + "epoch": 0.9285, + "grad_norm": 4.707767009735107, + "kl": 25.328125, + "learning_rate": 3.1416838871368925e-07, + "loss": 2.0851, + "num_tokens": 52302445.0, + "reward": 1.4453125, + "reward_std": 0.828660398721695, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4459725022315979, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.3957666605710983, + "step": 1857, + "token_counts/after_target": 703.25, + "token_counts/after_think": 25.5, + "token_counts/before_target": 1919.75, + "token_counts/before_think": 656.75 + }, + { + "avg_penalty/after_target": 2.1325753331184387, + "avg_penalty/after_think": 3.4177279472351074, + "avg_penalty/before_target": 0.6192266866564751, + "avg_penalty/before_think": 0.6305234730243683, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.0, + "completions/max_terminated_length": 692.0, + "completions/mean_length": 223.25, + "completions/mean_terminated_length": 223.25, + "completions/min_length": 40.25, + "completions/min_terminated_length": 40.25, + "epoch": 0.929, + "grad_norm": 6.947671890258789, + "kl": 20.71875, + "learning_rate": 3.098426859313053e-07, + "loss": 2.0231, + "num_tokens": 52327293.0, + "reward": 1.75, + "reward_std": 0.7371437102556229, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.36483466625213623, + "rewards/tag_count_reward/mean": 0.859375, + "rewards/tag_count_reward/std": 0.33167339861392975, + "step": 1858, + "token_counts/after_target": 1043.75, + "token_counts/after_think": 43.75, + "token_counts/before_target": 1339.25, + "token_counts/before_think": 1145.25 + }, + { + 
"avg_penalty/after_target": 2.2436623871326447, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4789597913622856, + "avg_penalty/before_think": 0.6639351919293404, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 709.5, + "completions/max_terminated_length": 709.5, + "completions/mean_length": 247.265625, + "completions/mean_terminated_length": 247.265625, + "completions/min_length": 44.75, + "completions/min_terminated_length": 44.75, + "epoch": 0.9295, + "grad_norm": 4.7075371742248535, + "kl": 21.1875, + "learning_rate": 3.0554650104861137e-07, + "loss": 1.9767, + "num_tokens": 52352766.0, + "reward": 1.55078125, + "reward_std": 0.7606871426105499, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44938503205776215, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.3494030684232712, + "step": 1859, + "token_counts/after_target": 1031.75, + "token_counts/after_think": 60.0, + "token_counts/before_target": 2008.0, + "token_counts/before_think": 856.5 + }, + { + "avg_penalty/after_target": 1.5976937115192413, + "avg_penalty/after_think": 3.374137222766876, + "avg_penalty/before_target": 0.4797305166721344, + "avg_penalty/before_think": 0.35945019870996475, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.25, + "completions/max_terminated_length": 553.25, + "completions/mean_length": 173.390625, + "completions/mean_terminated_length": 173.390625, + "completions/min_length": 38.75, + "completions/min_terminated_length": 38.75, + "epoch": 0.93, + "grad_norm": 3.9344406127929688, + 
"kl": 28.5, + "learning_rate": 3.0127984715253246e-07, + "loss": 2.3135, + "num_tokens": 52372535.0, + "reward": 1.53515625, + "reward_std": 0.958944097161293, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.11967839300632477, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4604102149605751, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.4329407960176468, + "step": 1860, + "token_counts/after_target": 573.75, + "token_counts/after_think": 65.75, + "token_counts/before_target": 1643.0, + "token_counts/before_think": 491.75 + }, + { + "avg_penalty/after_target": 3.0787791907787323, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.2787284553050995, + "avg_penalty/before_think": 0.6177172958850861, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 678.0, + "completions/max_terminated_length": 582.75, + "completions/mean_length": 242.265625, + "completions/mean_terminated_length": 230.8937530517578, + "completions/min_length": 52.25, + "completions/min_terminated_length": 52.25, + "epoch": 0.9305, + "grad_norm": 3.239339590072632, + "kl": 19.8671875, + "learning_rate": 2.970427372400353e-07, + "loss": 1.6619, + "num_tokens": 52400520.0, + "reward": 1.5390625, + "reward_std": 0.7489597797393799, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.42206869274377823, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3189389482140541, + "step": 1861, + "token_counts/after_target": 532.75, + "token_counts/after_think": 265.25, + "token_counts/before_target": 1842.0, + "token_counts/before_think": 1236.25 + }, + { + "avg_penalty/after_target": 
2.85556298494339, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3898920789361, + "avg_penalty/before_think": 0.408449649810791, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 679.75, + "completions/max_terminated_length": 583.25, + "completions/mean_length": 226.390625, + "completions/mean_terminated_length": 214.52291870117188, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.931, + "grad_norm": 4.208570957183838, + "kl": 20.203125, + "learning_rate": 2.928351842180921e-07, + "loss": 1.8138, + "num_tokens": 52423329.0, + "reward": 1.56640625, + "reward_std": 0.7793923616409302, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.43399807065725327, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.3599143847823143, + "step": 1862, + "token_counts/after_target": 669.0, + "token_counts/after_think": 146.5, + "token_counts/before_target": 1829.25, + "token_counts/before_think": 977.5 + }, + { + "avg_penalty/after_target": 2.346538484096527, + "avg_penalty/after_think": 3.7101438641548157, + "avg_penalty/before_target": 0.5935404896736145, + "avg_penalty/before_think": 0.5800086036324501, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 805.25, + "completions/max_terminated_length": 669.5, + "completions/mean_length": 205.84375, + "completions/mean_terminated_length": 192.9937515258789, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.9315, + "grad_norm": 7.787662982940674, + "kl": 18.90625, + 
"learning_rate": 2.8865720090364037e-07, + "loss": 1.8737, + "num_tokens": 52447431.0, + "reward": 1.59375, + "reward_std": 0.7864574640989304, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.37671056389808655, + "step": 1863, + "token_counts/after_target": 856.5, + "token_counts/after_think": 109.0, + "token_counts/before_target": 1579.0, + "token_counts/before_think": 749.0 + }, + { + "avg_penalty/after_target": 2.6363402009010315, + "avg_penalty/after_think": 2.7217600345611572, + "avg_penalty/before_target": 0.691260501742363, + "avg_penalty/before_think": 0.4032859541475773, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 828.25, + "completions/max_terminated_length": 828.25, + "completions/mean_length": 259.953125, + "completions/mean_terminated_length": 259.953125, + "completions/min_length": 44.25, + "completions/min_terminated_length": 44.25, + "epoch": 0.932, + "grad_norm": 3.3128561973571777, + "kl": 31.5625, + "learning_rate": 2.8450880002353967e-07, + "loss": 2.6382, + "num_tokens": 52474852.0, + "reward": 1.46875, + "reward_std": 0.8457031399011612, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4493217319250107, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.40441984683275223, + "step": 1864, + "token_counts/after_target": 1113.75, + "token_counts/after_think": 25.5, + "token_counts/before_target": 1977.25, + "token_counts/before_think": 1042.75 + }, + { + "avg_penalty/after_target": 2.1589252948760986, + "avg_penalty/after_think": 3.0, + 
"avg_penalty/before_target": 0.5896811559796333, + "avg_penalty/before_think": 0.33405449986457825, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 748.0, + "completions/max_terminated_length": 748.0, + "completions/mean_length": 187.296875, + "completions/mean_terminated_length": 187.296875, + "completions/min_length": 46.5, + "completions/min_terminated_length": 46.5, + "epoch": 0.9325, + "grad_norm": 11.620245933532715, + "kl": 22.03125, + "learning_rate": 2.8038999421453827e-07, + "loss": 2.3302, + "num_tokens": 52496055.0, + "reward": 1.62890625, + "reward_std": 0.6920855343341827, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.40263500809669495, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.30850401520729065, + "step": 1865, + "token_counts/after_target": 727.75, + "token_counts/after_think": 99.0, + "token_counts/before_target": 1413.0, + "token_counts/before_think": 757.0 + }, + { + "avg_penalty/after_target": 2.393613874912262, + "avg_penalty/after_think": 2.3353317975997925, + "avg_penalty/before_target": 0.4561016671359539, + "avg_penalty/before_think": 0.3682888448238373, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.75, + "completions/max_terminated_length": 719.75, + "completions/mean_length": 205.859375, + "completions/mean_terminated_length": 205.859375, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.933, + "grad_norm": 3.7721574306488037, + "kl": 19.015625, + "learning_rate": 2.7630079602323447e-07, + "loss": 1.7156, + 
"num_tokens": 52518222.0, + "reward": 1.58984375, + "reward_std": 0.7445605397224426, + "rewards/accuracy_reward/mean": NaN, + "rewards/accuracy_reward/std": NaN, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.3925696536898613, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.3601352274417877, + "step": 1866, + "token_counts/after_target": 443.5, + "token_counts/after_think": 110.75, + "token_counts/before_target": 1182.0, + "token_counts/before_think": 1557.5 + }, + { + "avg_penalty/after_target": 2.2125468850135803, + "avg_penalty/after_think": 3.662417769432068, + "avg_penalty/before_target": 0.5228299424052238, + "avg_penalty/before_think": 0.6260585188865662, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.5, + "completions/max_terminated_length": 719.5, + "completions/mean_length": 217.15625, + "completions/mean_terminated_length": 217.15625, + "completions/min_length": 25.5, + "completions/min_terminated_length": 25.5, + "epoch": 0.9335, + "grad_norm": 4.576792240142822, + "kl": 22.171875, + "learning_rate": 2.7224121790603517e-07, + "loss": 2.0685, + "num_tokens": 52540568.0, + "reward": 1.62109375, + "reward_std": 0.6812863498926163, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4229728877544403, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.2761028930544853, + "step": 1867, + "token_counts/after_target": 736.25, + "token_counts/after_think": 30.75, + "token_counts/before_target": 1781.0, + "token_counts/before_think": 926.5 + }, + { + "avg_penalty/after_target": 2.9587900638580322, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.38216955587267876, + 
"avg_penalty/before_think": 0.5049433633685112, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 916.0, + "completions/max_terminated_length": 759.5, + "completions/mean_length": 242.1875, + "completions/mean_terminated_length": 217.41875839233398, + "completions/min_length": 40.5, + "completions/min_terminated_length": 40.5, + "epoch": 0.934, + "grad_norm": 2.982139825820923, + "kl": 24.578125, + "learning_rate": 2.682112722291186e-07, + "loss": 2.1211, + "num_tokens": 52565012.0, + "reward": 1.46875, + "reward_std": 0.852589026093483, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4625816270709038, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.40487170964479446, + "step": 1868, + "token_counts/after_target": 973.25, + "token_counts/after_think": 25.0, + "token_counts/before_target": 1907.5, + "token_counts/before_think": 969.25 + }, + { + "avg_penalty/after_target": 1.9308162033557892, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.33735988661646843, + "avg_penalty/before_think": 0.5397751182317734, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 728.75, + "completions/max_terminated_length": 596.25, + "completions/mean_length": 240.453125, + "completions/mean_terminated_length": 226.85625076293945, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.9345, + "grad_norm": 8.560347557067871, + "kl": 26.40625, + "learning_rate": 2.6421097126839714e-07, + "loss": 1.9708, + "num_tokens": 52590801.0, + "reward": 1.3671875, + 
"reward_std": 0.9102939367294312, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.48456869274377823, + "rewards/tag_count_reward/mean": 0.7109375, + "rewards/tag_count_reward/std": 0.4380369037389755, + "step": 1869, + "token_counts/after_target": 603.25, + "token_counts/after_think": 119.75, + "token_counts/before_target": 2054.75, + "token_counts/before_think": 1069.5 + }, + { + "avg_penalty/after_target": 2.248379111289978, + "avg_penalty/after_think": 3.9187320470809937, + "avg_penalty/before_target": 0.3854983076453209, + "avg_penalty/before_think": 0.4989674761891365, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.25, + "completions/max_terminated_length": 646.25, + "completions/mean_length": 251.3125, + "completions/mean_terminated_length": 251.3125, + "completions/min_length": 33.25, + "completions/min_terminated_length": 33.25, + "epoch": 0.935, + "grad_norm": 6.661288261413574, + "kl": 23.5, + "learning_rate": 2.6024032720948446e-07, + "loss": 1.8167, + "num_tokens": 52618005.0, + "reward": 1.40625, + "reward_std": 0.8891266286373138, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.48148179799318314, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.42768532782793045, + "step": 1870, + "token_counts/after_target": 495.5, + "token_counts/after_think": 269.5, + "token_counts/before_target": 2348.25, + "token_counts/before_think": 907.75 + }, + { + "avg_penalty/after_target": 2.694843828678131, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.32204949855804443, + "avg_penalty/before_think": 0.776955746114254, + "clip_ratio/high_max": 
0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.5, + "completions/max_terminated_length": 733.5, + "completions/mean_length": 230.453125, + "completions/mean_terminated_length": 230.453125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.9355, + "grad_norm": 8.083914756774902, + "kl": 15.28125, + "learning_rate": 2.5629935214764866e-07, + "loss": 1.5526, + "num_tokens": 52646706.0, + "reward": 1.54296875, + "reward_std": 0.8200551122426987, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.39505382627248764, + "step": 1871, + "token_counts/after_target": 979.0, + "token_counts/after_think": 86.5, + "token_counts/before_target": 1505.5, + "token_counts/before_think": 1116.25 + }, + { + "avg_penalty/after_target": 1.5580914616584778, + "avg_penalty/after_think": 3.613077759742737, + "avg_penalty/before_target": 0.47831184417009354, + "avg_penalty/before_think": 0.38126903772354126, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.5, + "completions/max_terminated_length": 706.5, + "completions/mean_length": 198.078125, + "completions/mean_terminated_length": 198.078125, + "completions/min_length": 39.75, + "completions/min_terminated_length": 39.75, + "epoch": 0.936, + "grad_norm": 6.025538921356201, + "kl": 17.0703125, + "learning_rate": 2.523880580877824e-07, + "loss": 1.6597, + "num_tokens": 52669767.0, + "reward": 1.65625, + "reward_std": 0.6758195161819458, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3837348371744156, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.2979462891817093, + "step": 1872, + "token_counts/after_target": 474.0, + "token_counts/after_think": 109.25, + "token_counts/before_target": 1512.75, + "token_counts/before_think": 1073.25 + }, + { + "avg_penalty/after_target": 1.643177330493927, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4881255552172661, + "avg_penalty/before_think": 0.458699993789196, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 813.5, + "completions/max_terminated_length": 718.25, + "completions/mean_length": 251.59375, + "completions/mean_terminated_length": 229.02291870117188, + "completions/min_length": 40.75, + "completions/min_terminated_length": 40.75, + "epoch": 0.9365, + "grad_norm": 9.059829711914062, + "kl": 23.4921875, + "learning_rate": 2.4850645694436736e-07, + "loss": 1.7877, + "num_tokens": 52694925.0, + "reward": 1.54296875, + "reward_std": 0.6503110826015472, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.3780868947505951, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.29007405787706375, + "step": 1873, + "token_counts/after_target": 808.75, + "token_counts/after_think": 73.0, + "token_counts/before_target": 2177.75, + "token_counts/before_think": 966.0 + }, + { + "avg_penalty/after_target": 2.166040986776352, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4119243919849396, + "avg_penalty/before_think": 0.4252297356724739, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.25, + "completions/max_terminated_length": 673.25, + "completions/mean_length": 220.203125, + "completions/mean_terminated_length": 220.203125, + "completions/min_length": 42.25, + "completions/min_terminated_length": 42.25, + "epoch": 0.937, + "grad_norm": 6.69069766998291, + "kl": 19.984375, + "learning_rate": 2.446545605414341e-07, + "loss": 1.5429, + "num_tokens": 52718522.0, + "reward": 1.51171875, + "reward_std": 0.7922469079494476, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.42430340498685837, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.3857579156756401, + "step": 1874, + "token_counts/after_target": 586.75, + "token_counts/after_think": 24.25, + "token_counts/before_target": 1665.5, + "token_counts/before_think": 1246.75 + }, + { + "avg_penalty/after_target": 2.63490429520607, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.5536040849983692, + "avg_penalty/before_think": 0.4412877522408962, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 773.75, + "completions/max_terminated_length": 773.75, + "completions/mean_length": 269.125, + "completions/mean_terminated_length": 269.125, + "completions/min_length": 48.75, + "completions/min_terminated_length": 48.75, + "epoch": 0.9375, + "grad_norm": 2.4430322647094727, + "kl": 22.23828125, + "learning_rate": 2.4083238061252565e-07, + "loss": 1.9787, + "num_tokens": 52744498.0, + "reward": 1.55078125, + "reward_std": 0.6693987548351288, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + 
"rewards/format_reward/std": 0.36180340498685837, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.32401271164417267, + "step": 1875, + "token_counts/after_target": 1404.5, + "token_counts/after_think": 18.25, + "token_counts/before_target": 2027.5, + "token_counts/before_think": 855.75 + }, + { + "avg_penalty/after_target": 1.972460001707077, + "avg_penalty/after_think": 3.6481680274009705, + "avg_penalty/before_target": 0.35448887571692467, + "avg_penalty/before_think": 0.427372008562088, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.0, + "completions/max_terminated_length": 560.0, + "completions/mean_length": 215.453125, + "completions/mean_terminated_length": 215.453125, + "completions/min_length": 40.5, + "completions/min_terminated_length": 40.5, + "epoch": 0.938, + "grad_norm": 8.787372589111328, + "kl": 21.546875, + "learning_rate": 2.370399288006664e-07, + "loss": 1.5794, + "num_tokens": 52766911.0, + "reward": 1.453125, + "reward_std": 0.8006723523139954, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.44938503205776215, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.3804534077644348, + "step": 1876, + "token_counts/after_target": 451.75, + "token_counts/after_think": 27.5, + "token_counts/before_target": 2033.75, + "token_counts/before_think": 934.25 + }, + { + "avg_penalty/after_target": 2.006700724363327, + "avg_penalty/after_think": 3.4089344143867493, + "avg_penalty/before_target": 0.3613607697188854, + "avg_penalty/before_think": 0.4673324264585972, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 519.5, + "completions/max_terminated_length": 519.5, + "completions/mean_length": 176.921875, + "completions/mean_terminated_length": 176.921875, + "completions/min_length": 31.25, + "completions/min_terminated_length": 31.25, + "epoch": 0.9385, + "grad_norm": 3.5148355960845947, + "kl": 22.09375, + "learning_rate": 2.332772166583208e-07, + "loss": 1.8159, + "num_tokens": 52786714.0, + "reward": 1.58984375, + "reward_std": 0.7528425753116608, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4255262687802315, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.3715926334261894, + "step": 1877, + "token_counts/after_target": 426.0, + "token_counts/after_think": 79.0, + "token_counts/before_target": 1589.0, + "token_counts/before_think": 736.75 + }, + { + "avg_penalty/after_target": 2.177429437637329, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.5211850926280022, + "avg_penalty/before_think": 0.5230404511094093, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.25, + "completions/max_terminated_length": 597.25, + "completions/mean_length": 198.546875, + "completions/mean_terminated_length": 198.546875, + "completions/min_length": 48.5, + "completions/min_terminated_length": 48.5, + "epoch": 0.939, + "grad_norm": 7.16335391998291, + "kl": 17.703125, + "learning_rate": 2.295442556473637e-07, + "loss": 1.7699, + "num_tokens": 52808685.0, + "reward": 1.66796875, + "reward_std": 0.6860391497612, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + 
"rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.29673265293240547, + "step": 1878, + "token_counts/after_target": 446.25, + "token_counts/after_think": 188.75, + "token_counts/before_target": 1685.5, + "token_counts/before_think": 856.25 + }, + { + "avg_penalty/after_target": 2.62765434384346, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.37458305433392525, + "avg_penalty/before_think": 0.6302100643515587, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.25, + "completions/max_terminated_length": 582.25, + "completions/mean_length": 210.078125, + "completions/mean_terminated_length": 210.078125, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.9395, + "grad_norm": 7.4598846435546875, + "kl": 18.828125, + "learning_rate": 2.2584105713904126e-07, + "loss": 1.8972, + "num_tokens": 52831538.0, + "reward": 1.59375, + "reward_std": 0.7698181569576263, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.42867646366357803, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.36224547028541565, + "step": 1879, + "token_counts/after_target": 671.25, + "token_counts/after_think": 82.75, + "token_counts/before_target": 1636.5, + "token_counts/before_think": 970.75 + }, + { + "avg_penalty/after_target": 2.467797577381134, + "avg_penalty/after_think": 0.7592735290527344, + "avg_penalty/before_target": 0.47121959924697876, + "avg_penalty/before_think": 0.46831682696938515, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 
595.25, + "completions/max_terminated_length": 595.25, + "completions/mean_length": 265.5625, + "completions/mean_terminated_length": 265.5625, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.94, + "grad_norm": 5.434757709503174, + "kl": 21.25, + "learning_rate": 2.221676324139377e-07, + "loss": 1.713, + "num_tokens": 52858614.0, + "reward": 1.4609375, + "reward_std": 0.8525470495223999, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4682852029800415, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.4052830636501312, + "step": 1880, + "token_counts/after_target": 1096.25, + "token_counts/after_think": 5.5, + "token_counts/before_target": 1870.5, + "token_counts/before_think": 1276.75 + }, + { + "avg_penalty/after_target": 3.089720904827118, + "avg_penalty/after_think": 2.8071306347846985, + "avg_penalty/before_target": 0.27946148812770844, + "avg_penalty/before_think": 0.37924060225486755, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 173.484375, + "completions/mean_terminated_length": 173.484375, + "completions/min_length": 43.75, + "completions/min_terminated_length": 43.75, + "epoch": 0.9405, + "grad_norm": 6.584305286407471, + "kl": 24.46875, + "learning_rate": 2.1852399266194312e-07, + "loss": 1.9061, + "num_tokens": 52879301.0, + "reward": 1.47265625, + "reward_std": 0.8851542919874191, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4625816270709038, + "rewards/tag_count_reward/mean": 0.75390625, + 
"rewards/tag_count_reward/std": 0.42448709160089493, + "step": 1881, + "token_counts/after_target": 444.0, + "token_counts/after_think": 36.5, + "token_counts/before_target": 1611.5, + "token_counts/before_think": 683.75 + }, + { + "avg_penalty/after_target": 2.327208489179611, + "avg_penalty/after_think": 3.9981465935707092, + "avg_penalty/before_target": 0.516102209687233, + "avg_penalty/before_think": 0.5192352905869484, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 625.75, + "completions/max_terminated_length": 532.25, + "completions/mean_length": 238.65625, + "completions/mean_terminated_length": 226.56250381469727, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.941, + "grad_norm": 3.6142172813415527, + "kl": 20.59375, + "learning_rate": 2.1491014898221585e-07, + "loss": 1.847, + "num_tokens": 52904911.0, + "reward": 1.60546875, + "reward_std": 0.7636734545230865, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.422013059258461, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.35774407535791397, + "step": 1882, + "token_counts/after_target": 872.0, + "token_counts/after_think": 169.0, + "token_counts/before_target": 1638.25, + "token_counts/before_think": 1139.25 + }, + { + "avg_penalty/after_target": 1.990856647491455, + "avg_penalty/after_think": 2.763281226158142, + "avg_penalty/before_target": 0.34651029482483864, + "avg_penalty/before_think": 0.48876257985830307, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.75, + 
"completions/max_terminated_length": 677.75, + "completions/mean_length": 204.921875, + "completions/mean_terminated_length": 204.921875, + "completions/min_length": 68.75, + "completions/min_terminated_length": 68.75, + "epoch": 0.9415, + "grad_norm": 10.370560646057129, + "kl": 22.125, + "learning_rate": 2.1132611238315004e-07, + "loss": 1.6147, + "num_tokens": 52929050.0, + "reward": 1.484375, + "reward_std": 0.8533343225717545, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4604102149605751, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.40370409935712814, + "step": 1883, + "token_counts/after_target": 425.25, + "token_counts/after_think": 48.25, + "token_counts/before_target": 1864.25, + "token_counts/before_think": 941.0 + }, + { + "avg_penalty/after_target": 1.862490177154541, + "avg_penalty/after_think": 2.930522859096527, + "avg_penalty/before_target": 0.4743267223238945, + "avg_penalty/before_think": 0.6003572568297386, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.0, + "completions/max_terminated_length": 604.0, + "completions/mean_length": 264.703125, + "completions/mean_terminated_length": 264.703125, + "completions/min_length": 44.75, + "completions/min_terminated_length": 44.75, + "epoch": 0.942, + "grad_norm": 9.614221572875977, + "kl": 24.765625, + "learning_rate": 2.077718937823414e-07, + "loss": 1.9067, + "num_tokens": 52958535.0, + "reward": 1.40234375, + "reward_std": 0.8589863628149033, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.46566852182149887, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 
0.41232746839523315, + "step": 1884, + "token_counts/after_target": 751.75, + "token_counts/after_think": 103.25, + "token_counts/before_target": 2276.0, + "token_counts/before_think": 1104.25 + }, + { + "avg_penalty/after_target": 1.5715100765228271, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4312022477388382, + "avg_penalty/before_think": 0.3336438164114952, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 647.5, + "completions/max_terminated_length": 519.75, + "completions/mean_length": 199.890625, + "completions/mean_terminated_length": 187.51667022705078, + "completions/min_length": 42.75, + "completions/min_terminated_length": 42.75, + "epoch": 0.9425, + "grad_norm": 6.303762912750244, + "kl": 19.53125, + "learning_rate": 2.0424750400655947e-07, + "loss": 1.5498, + "num_tokens": 52984256.0, + "reward": 1.57421875, + "reward_std": 0.7498749792575836, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4136601909995079, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.35008007287979126, + "step": 1885, + "token_counts/after_target": 482.25, + "token_counts/after_think": 30.75, + "token_counts/before_target": 1723.0, + "token_counts/before_think": 962.25 + }, + { + "avg_penalty/after_target": 2.8441527485847473, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.31050366908311844, + "avg_penalty/before_think": 0.5779256299138069, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.25, + "completions/max_terminated_length": 595.25, + 
"completions/mean_length": 246.515625, + "completions/mean_terminated_length": 246.515625, + "completions/min_length": 59.75, + "completions/min_terminated_length": 59.75, + "epoch": 0.943, + "grad_norm": 3.155280351638794, + "kl": 19.671875, + "learning_rate": 2.0075295379170413e-07, + "loss": 1.7704, + "num_tokens": 53008081.0, + "reward": 1.5859375, + "reward_std": 0.7464170902967453, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4255262687802315, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.35138915479183197, + "step": 1886, + "token_counts/after_target": 920.25, + "token_counts/after_think": 12.0, + "token_counts/before_target": 1722.75, + "token_counts/before_think": 1289.25 + }, + { + "avg_penalty/after_target": 2.432209074497223, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.31833329796791077, + "avg_penalty/before_think": 0.5378719493746758, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.5, + "completions/max_terminated_length": 567.5, + "completions/mean_length": 226.65625, + "completions/mean_terminated_length": 226.65625, + "completions/min_length": 40.25, + "completions/min_terminated_length": 40.25, + "epoch": 0.9435, + "grad_norm": 10.113119125366211, + "kl": 23.75, + "learning_rate": 1.9728825378278248e-07, + "loss": 1.7665, + "num_tokens": 53037531.0, + "reward": 1.42578125, + "reward_std": 0.8115631192922592, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.4546433389186859, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.3825000561773777, + "step": 1887, + 
"token_counts/after_target": 530.0, + "token_counts/after_think": 75.25, + "token_counts/before_target": 1782.5, + "token_counts/before_think": 1238.75 + }, + { + "avg_penalty/after_target": 2.860215663909912, + "avg_penalty/after_think": 2.949370324611664, + "avg_penalty/before_target": 0.37960005924105644, + "avg_penalty/before_think": 0.4731728918850422, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 538.0, + "completions/max_terminated_length": 538.0, + "completions/mean_length": 202.015625, + "completions/mean_terminated_length": 202.015625, + "completions/min_length": 31.75, + "completions/min_terminated_length": 31.75, + "epoch": 0.944, + "grad_norm": 3.506093740463257, + "kl": 14.09375, + "learning_rate": 1.9385341453386997e-07, + "loss": 1.3121, + "num_tokens": 53063020.0, + "reward": 1.61328125, + "reward_std": 0.7567105442285538, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4255262687802315, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.33199920505285263, + "step": 1888, + "token_counts/after_target": 503.25, + "token_counts/after_think": 88.25, + "token_counts/before_target": 1582.5, + "token_counts/before_think": 1058.25 + }, + { + "avg_penalty/after_target": 2.342393070459366, + "avg_penalty/after_think": 1.6813509464263916, + "avg_penalty/before_target": 0.45506012439727783, + "avg_penalty/before_think": 0.4058004394173622, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 686.0, + "completions/max_terminated_length": 597.75, + "completions/mean_length": 198.265625, 
+ "completions/mean_terminated_length": 185.44166946411133, + "completions/min_length": 48.25, + "completions/min_terminated_length": 48.25, + "epoch": 0.9445, + "grad_norm": 10.640490531921387, + "kl": 20.03125, + "learning_rate": 1.9044844650808468e-07, + "loss": 1.9347, + "num_tokens": 53088413.0, + "reward": 1.6484375, + "reward_std": 0.7088254988193512, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.39476002007722855, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.32957057654857635, + "step": 1889, + "token_counts/after_target": 561.5, + "token_counts/after_think": 55.25, + "token_counts/before_target": 1515.0, + "token_counts/before_think": 1040.5 + }, + { + "avg_penalty/after_target": 1.9782337248325348, + "avg_penalty/after_think": 3.8982452154159546, + "avg_penalty/before_target": 0.30103714019060135, + "avg_penalty/before_think": 0.4571961537003517, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.75, + "completions/max_terminated_length": 490.75, + "completions/mean_length": 193.359375, + "completions/mean_terminated_length": 193.359375, + "completions/min_length": 42.75, + "completions/min_terminated_length": 42.75, + "epoch": 0.945, + "grad_norm": 7.867733001708984, + "kl": 22.265625, + "learning_rate": 1.8707336007754873e-07, + "loss": 1.6717, + "num_tokens": 53111396.0, + "reward": 1.5546875, + "reward_std": 0.7327546775341034, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.41898179799318314, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.32710785418748856, + "step": 1890, + "token_counts/after_target": 384.25, 
+ "token_counts/after_think": 85.75, + "token_counts/before_target": 1719.25, + "token_counts/before_think": 904.5 + }, + { + "avg_penalty/after_target": 2.0908525586128235, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3763969726860523, + "avg_penalty/before_think": 0.5251865461468697, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.75, + "completions/max_terminated_length": 568.75, + "completions/mean_length": 242.359375, + "completions/mean_terminated_length": 242.359375, + "completions/min_length": 47.75, + "completions/min_terminated_length": 47.75, + "epoch": 0.9455, + "grad_norm": 7.437994956970215, + "kl": 22.28125, + "learning_rate": 1.8372816552336025e-07, + "loss": 1.7032, + "num_tokens": 53138459.0, + "reward": 1.40234375, + "reward_std": 0.8900370299816132, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.48558124154806137, + "rewards/tag_count_reward/mean": 0.73046875, + "rewards/tag_count_reward/std": 0.3932846188545227, + "step": 1891, + "token_counts/after_target": 691.0, + "token_counts/after_think": 88.5, + "token_counts/before_target": 2021.75, + "token_counts/before_think": 1076.5 + }, + { + "avg_penalty/after_target": 2.237099677324295, + "avg_penalty/after_think": 3.6410603523254395, + "avg_penalty/before_target": 0.4902326390147209, + "avg_penalty/before_think": 0.409954059869051, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.0, + "completions/max_terminated_length": 581.0, + "completions/mean_length": 183.890625, + "completions/mean_terminated_length": 
183.890625, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.946, + "grad_norm": 5.352094650268555, + "kl": 23.671875, + "learning_rate": 1.8041287303556366e-07, + "loss": 2.1653, + "num_tokens": 53162436.0, + "reward": 1.53125, + "reward_std": 0.7862334996461868, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.404181070625782, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.3936670422554016, + "step": 1892, + "token_counts/after_target": 840.0, + "token_counts/after_think": 65.0, + "token_counts/before_target": 1555.75, + "token_counts/before_think": 481.5 + }, + { + "avg_penalty/after_target": 2.3681753277778625, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.40854786336421967, + "avg_penalty/before_think": 0.6132830046117306, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.5, + "completions/max_terminated_length": 651.5, + "completions/mean_length": 236.25, + "completions/mean_terminated_length": 236.25, + "completions/min_length": 45.5, + "completions/min_terminated_length": 45.5, + "epoch": 0.9465, + "grad_norm": 3.151745080947876, + "kl": 24.21875, + "learning_rate": 1.7712749271311392e-07, + "loss": 2.0782, + "num_tokens": 53191780.0, + "reward": 1.421875, + "reward_std": 0.8351188749074936, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.45028156042099, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.408820778131485, + "step": 1893, + "token_counts/after_target": 825.25, + "token_counts/after_think": 51.5, + "token_counts/before_target": 2007.5, + 
"token_counts/before_think": 895.75 + }, + { + "avg_penalty/after_target": 2.1022607684135437, + "avg_penalty/after_think": 2.8034790754318237, + "avg_penalty/before_target": 0.41753075644373894, + "avg_penalty/before_think": 0.4625480845570564, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 647.0, + "completions/max_terminated_length": 593.75, + "completions/mean_length": 221.84375, + "completions/mean_terminated_length": 210.51458740234375, + "completions/min_length": 39.5, + "completions/min_terminated_length": 39.5, + "epoch": 0.947, + "grad_norm": 2.8346493244171143, + "kl": 19.25, + "learning_rate": 1.7387203456384784e-07, + "loss": 1.7458, + "num_tokens": 53213866.0, + "reward": 1.59375, + "reward_std": 0.7729007005691528, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.42516325414180756, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.3654702752828598, + "step": 1894, + "token_counts/after_target": 564.0, + "token_counts/after_think": 63.25, + "token_counts/before_target": 1369.5, + "token_counts/before_think": 1552.75 + }, + { + "avg_penalty/after_target": 2.587328553199768, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.30534449219703674, + "avg_penalty/before_think": 0.5679627433419228, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 174.4375, + "completions/mean_terminated_length": 174.4375, + "completions/min_length": 47.5, + "completions/min_terminated_length": 47.5, + "epoch": 
0.9475, + "grad_norm": 3.427304267883301, + "kl": 19.140625, + "learning_rate": 1.706465085044584e-07, + "loss": 1.6186, + "num_tokens": 53234646.0, + "reward": 1.5234375, + "reward_std": 0.8770487159490585, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.45283494144678116, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.39520320296287537, + "step": 1895, + "token_counts/after_target": 427.0, + "token_counts/after_think": 43.0, + "token_counts/before_target": 1462.5, + "token_counts/before_think": 858.5 + }, + { + "avg_penalty/after_target": 2.064325511455536, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.44427232444286346, + "avg_penalty/before_think": 0.5437817052006721, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.25, + "completions/max_terminated_length": 560.25, + "completions/mean_length": 240.03125, + "completions/mean_terminated_length": 240.03125, + "completions/min_length": 35.25, + "completions/min_terminated_length": 35.25, + "epoch": 0.948, + "grad_norm": 6.174627780914307, + "kl": 22.21875, + "learning_rate": 1.6745092436045495e-07, + "loss": 1.7215, + "num_tokens": 53259400.0, + "reward": 1.4375, + "reward_std": 0.8895306140184402, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.48148179799318314, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.4082561135292053, + "step": 1896, + "token_counts/after_target": 769.5, + "token_counts/after_think": 48.0, + "token_counts/before_target": 2147.25, + "token_counts/before_think": 875.75 + }, + { + 
"avg_penalty/after_target": 2.0329218804836273, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4220019094645977, + "avg_penalty/before_think": 0.42107850313186646, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 691.75, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 226.71875, + "completions/mean_terminated_length": 213.17708587646484, + "completions/min_length": 41.25, + "completions/min_terminated_length": 41.25, + "epoch": 0.9485, + "grad_norm": 5.4315619468688965, + "kl": 24.0625, + "learning_rate": 1.6428529186614195e-07, + "loss": 1.9167, + "num_tokens": 53283158.0, + "reward": 1.4921875, + "reward_std": 0.8067768961191177, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.46513500809669495, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.3639821633696556, + "step": 1897, + "token_counts/after_target": 651.0, + "token_counts/after_think": 80.25, + "token_counts/before_target": 1852.0, + "token_counts/before_think": 1044.25 + }, + { + "avg_penalty/after_target": 2.623130202293396, + "avg_penalty/after_think": 3.4367400407791138, + "avg_penalty/before_target": 0.3983324132859707, + "avg_penalty/before_think": 0.39133312553167343, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.25, + "completions/max_terminated_length": 547.25, + "completions/mean_length": 193.90625, + "completions/mean_terminated_length": 193.90625, + "completions/min_length": 58.5, + "completions/min_terminated_length": 58.5, + "epoch": 0.949, + "grad_norm": 
3.1917734146118164, + "kl": 20.28125, + "learning_rate": 1.6114962066458351e-07, + "loss": 1.7604, + "num_tokens": 53304352.0, + "reward": 1.59765625, + "reward_std": 0.7892830222845078, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.3979102149605751, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.3918500915169716, + "step": 1898, + "token_counts/after_target": 510.5, + "token_counts/after_think": 60.25, + "token_counts/before_target": 1738.0, + "token_counts/before_think": 793.75 + }, + { + "avg_penalty/after_target": 2.200644224882126, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.35084670782089233, + "avg_penalty/before_think": 0.49681633710861206, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 201.234375, + "completions/mean_terminated_length": 201.234375, + "completions/min_length": 48.25, + "completions/min_terminated_length": 48.25, + "epoch": 0.9495, + "grad_norm": 4.0924177169799805, + "kl": 13.6875, + "learning_rate": 1.580439203075812e-07, + "loss": 1.325, + "num_tokens": 53328767.0, + "reward": 1.76171875, + "reward_std": 0.7550032138824463, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.17430340498685837, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3837348371744156, + "rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.32426538318395615, + "step": 1899, + "token_counts/after_target": 665.75, + "token_counts/after_think": 117.0, + "token_counts/before_target": 1621.75, + "token_counts/before_think": 815.25 + }, + { + "avg_penalty/after_target": 
3.151456832885742, + "avg_penalty/after_think": 1.9426382780075073, + "avg_penalty/before_target": 0.331205353140831, + "avg_penalty/before_think": 0.49864012002944946, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.5, + "completions/max_terminated_length": 479.5, + "completions/mean_length": 204.84375, + "completions/mean_terminated_length": 204.84375, + "completions/min_length": 54.5, + "completions/min_terminated_length": 54.5, + "epoch": 0.95, + "grad_norm": 2.605498790740967, + "kl": 22.59375, + "learning_rate": 1.549682002556341e-07, + "loss": 1.9211, + "num_tokens": 53353205.0, + "reward": 1.4375, + "reward_std": 0.8444130420684814, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4613594636321068, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.4039152190089226, + "step": 1900, + "token_counts/after_target": 751.75, + "token_counts/after_think": 19.75, + "token_counts/before_target": 1806.75, + "token_counts/before_think": 699.25 + }, + { + "avg_penalty/after_target": 1.766701728105545, + "avg_penalty/after_think": 2.4868215322494507, + "avg_penalty/before_target": 0.5623980984091759, + "avg_penalty/before_think": 0.42930086702108383, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 595.75, + "completions/max_terminated_length": 531.5, + "completions/mean_length": 220.65625, + "completions/mean_terminated_length": 208.60000228881836, + "completions/min_length": 46.75, + "completions/min_terminated_length": 46.75, + "epoch": 0.9505, + "grad_norm": 3.3920040130615234, + "kl": 22.265625, + 
"learning_rate": 1.519224698779198e-07, + "loss": 1.8491, + "num_tokens": 53376639.0, + "reward": 1.5078125, + "reward_std": 0.8266905397176743, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4339347705245018, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.40308573096990585, + "step": 1901, + "token_counts/after_target": 735.25, + "token_counts/after_think": 68.75, + "token_counts/before_target": 1690.25, + "token_counts/before_think": 1036.25 + }, + { + "avg_penalty/after_target": 3.176036834716797, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.39756499975919724, + "avg_penalty/before_think": 0.6285748332738876, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.25, + "completions/max_terminated_length": 595.25, + "completions/mean_length": 226.28125, + "completions/mean_terminated_length": 226.28125, + "completions/min_length": 46.5, + "completions/min_terminated_length": 46.5, + "epoch": 0.951, + "grad_norm": 3.188052177429199, + "kl": 23.015625, + "learning_rate": 1.4890673845226133e-07, + "loss": 1.9824, + "num_tokens": 53401617.0, + "reward": 1.359375, + "reward_std": 0.8754878789186478, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4745560586452484, + "rewards/tag_count_reward/mean": 0.703125, + "rewards/tag_count_reward/std": 0.4173092618584633, + "step": 1902, + "token_counts/after_target": 944.75, + "token_counts/after_think": 91.0, + "token_counts/before_target": 1591.75, + "token_counts/before_think": 993.0 + }, + { + "avg_penalty/after_target": 2.1246972382068634, + "avg_penalty/after_think": 3.9849679470062256, + 
"avg_penalty/before_target": 0.3008105605840683, + "avg_penalty/before_think": 0.5669123083353043, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.25, + "completions/max_terminated_length": 688.25, + "completions/mean_length": 216.375, + "completions/mean_terminated_length": 216.375, + "completions/min_length": 47.75, + "completions/min_terminated_length": 47.75, + "epoch": 0.9515, + "grad_norm": 9.616724014282227, + "kl": 22.5, + "learning_rate": 1.4592101516509916e-07, + "loss": 1.7974, + "num_tokens": 53425033.0, + "reward": 1.54296875, + "reward_std": 0.8043068200349808, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.41503459960222244, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.3957827091217041, + "step": 1903, + "token_counts/after_target": 352.25, + "token_counts/after_think": 216.75, + "token_counts/before_target": 1968.25, + "token_counts/before_think": 924.75 + }, + { + "avg_penalty/after_target": 2.664582461118698, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.39877060800790787, + "avg_penalty/before_think": 0.3784286677837372, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.25, + "completions/max_terminated_length": 500.25, + "completions/mean_length": 221.765625, + "completions/mean_terminated_length": 221.765625, + "completions/min_length": 50.5, + "completions/min_terminated_length": 50.5, + "epoch": 0.952, + "grad_norm": 6.8457536697387695, + "kl": 16.703125, + "learning_rate": 1.4296530911146466e-07, + "loss": 1.6529, + "num_tokens": 53448298.0, + 
"reward": 1.6640625, + "reward_std": 0.7079028785228729, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.33164968341588974, + "step": 1904, + "token_counts/after_target": 813.5, + "token_counts/after_think": 12.25, + "token_counts/before_target": 1670.5, + "token_counts/before_think": 1052.0 + }, + { + "avg_penalty/after_target": 3.2010215520858765, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.29380226135253906, + "avg_penalty/before_think": 0.28656506910920143, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 176.109375, + "completions/mean_terminated_length": 176.109375, + "completions/min_length": 47.75, + "completions/min_terminated_length": 47.75, + "epoch": 0.9525, + "grad_norm": 3.7246925830841064, + "kl": 23.1875, + "learning_rate": 1.400396292949513e-07, + "loss": 1.9817, + "num_tokens": 53471777.0, + "reward": 1.3828125, + "reward_std": 0.8958399593830109, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.48680340498685837, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.4288243129849434, + "step": 1905, + "token_counts/after_target": 556.0, + "token_counts/after_think": 85.75, + "token_counts/before_target": 1640.5, + "token_counts/before_think": 535.5 + }, + { + "avg_penalty/after_target": 2.7352045476436615, + "avg_penalty/after_think": 3.2652695178985596, + "avg_penalty/before_target": 0.3403012901544571, + "avg_penalty/before_think": 
0.45791228860616684, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 642.75, + "completions/max_terminated_length": 537.5, + "completions/mean_length": 206.421875, + "completions/mean_terminated_length": 193.61979293823242, + "completions/min_length": 47.75, + "completions/min_terminated_length": 47.75, + "epoch": 0.953, + "grad_norm": 3.031984329223633, + "kl": 25.96875, + "learning_rate": 1.3714398462768563e-07, + "loss": 2.2053, + "num_tokens": 53494332.0, + "reward": 1.50390625, + "reward_std": 0.831280305981636, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.46296359598636627, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.38328002393245697, + "step": 1906, + "token_counts/after_target": 654.5, + "token_counts/after_think": 31.0, + "token_counts/before_target": 1793.0, + "token_counts/before_think": 824.25 + }, + { + "avg_penalty/after_target": 1.9571375250816345, + "avg_penalty/after_think": 3.9379385113716125, + "avg_penalty/before_target": 0.4718356877565384, + "avg_penalty/before_think": 0.5526544526219368, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.75, + "completions/max_terminated_length": 703.75, + "completions/mean_length": 266.328125, + "completions/mean_terminated_length": 266.328125, + "completions/min_length": 58.25, + "completions/min_terminated_length": 58.25, + "epoch": 0.9535, + "grad_norm": 3.4644806385040283, + "kl": 20.78125, + "learning_rate": 1.3427838393030634e-07, + "loss": 1.7417, + "num_tokens": 53522417.0, + "reward": 1.52734375, + "reward_std": 
0.8602030873298645, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44938503205776215, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.40711675584316254, + "step": 1907, + "token_counts/after_target": 856.75, + "token_counts/after_think": 108.0, + "token_counts/before_target": 1981.25, + "token_counts/before_think": 1315.25 + }, + { + "avg_penalty/after_target": 2.0811149179935455, + "avg_penalty/after_think": 2.1859167218208313, + "avg_penalty/before_target": 0.533949576318264, + "avg_penalty/before_think": 0.5744996815919876, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 697.5, + "completions/max_terminated_length": 697.5, + "completions/mean_length": 186.671875, + "completions/mean_terminated_length": 186.671875, + "completions/min_length": 31.5, + "completions/min_terminated_length": 31.5, + "epoch": 0.954, + "grad_norm": 4.934867858886719, + "kl": 23.8125, + "learning_rate": 1.3144283593192752e-07, + "loss": 1.9764, + "num_tokens": 53544876.0, + "reward": 1.55078125, + "reward_std": 0.7818303406238556, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4106728211045265, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.3871879354119301, + "step": 1908, + "token_counts/after_target": 568.25, + "token_counts/after_think": 27.5, + "token_counts/before_target": 1598.25, + "token_counts/before_think": 792.75 + }, + { + "avg_penalty/after_target": 2.8927815556526184, + "avg_penalty/after_think": 3.987315356731415, + "avg_penalty/before_target": 0.3123590163886547, + "avg_penalty/before_think": 0.5530594512820244, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.25, + "completions/max_terminated_length": 614.25, + "completions/mean_length": 195.328125, + "completions/mean_terminated_length": 195.328125, + "completions/min_length": 34.75, + "completions/min_terminated_length": 34.75, + "epoch": 0.9545, + "grad_norm": 3.5417439937591553, + "kl": 15.49609375, + "learning_rate": 1.2863734927012094e-07, + "loss": 1.317, + "num_tokens": 53567889.0, + "reward": 1.73046875, + "reward_std": 0.5734735578298569, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.31687305867671967, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.2681359797716141, + "step": 1909, + "token_counts/after_target": 452.0, + "token_counts/after_think": 102.0, + "token_counts/before_target": 1402.0, + "token_counts/before_think": 1169.25 + }, + { + "avg_penalty/after_target": 2.3365316092967987, + "avg_penalty/after_think": 2.9766828417778015, + "avg_penalty/before_target": 0.32597053050994873, + "avg_penalty/before_think": 0.46576453745365143, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.5, + "completions/max_terminated_length": 598.5, + "completions/mean_length": 198.328125, + "completions/mean_terminated_length": 198.328125, + "completions/min_length": 23.25, + "completions/min_terminated_length": 23.25, + "epoch": 0.955, + "grad_norm": 3.7390778064727783, + "kl": 19.625, + "learning_rate": 1.2586193249088607e-07, + "loss": 1.5933, + "num_tokens": 53593622.0, + "reward": 1.4921875, + "reward_std": 
0.8178307414054871, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.42430340498685837, + "rewards/tag_count_reward/mean": 0.7578125, + "rewards/tag_count_reward/std": 0.4029046967625618, + "step": 1910, + "token_counts/after_target": 368.75, + "token_counts/after_think": 80.75, + "token_counts/before_target": 2026.75, + "token_counts/before_think": 697.0 + }, + { + "avg_penalty/after_target": 2.8557136952877045, + "avg_penalty/after_think": 3.869507670402527, + "avg_penalty/before_target": 0.31122319400310516, + "avg_penalty/before_think": 0.45368270576000214, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 722.25, + "completions/max_terminated_length": 643.5, + "completions/mean_length": 244.625, + "completions/mean_terminated_length": 233.92708587646484, + "completions/min_length": 65.5, + "completions/min_terminated_length": 65.5, + "epoch": 0.9555, + "grad_norm": 2.8849782943725586, + "kl": 18.703125, + "learning_rate": 1.231165940486234e-07, + "loss": 1.6081, + "num_tokens": 53620190.0, + "reward": 1.50390625, + "reward_std": 0.8525655120611191, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4519384130835533, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.4085593745112419, + "step": 1911, + "token_counts/after_target": 533.25, + "token_counts/after_think": 86.0, + "token_counts/before_target": 1999.5, + "token_counts/before_think": 1295.25 + }, + { + "avg_penalty/after_target": 2.897200882434845, + "avg_penalty/after_think": 1.4481188356876373, + "avg_penalty/before_target": 0.44769179075956345, + "avg_penalty/before_think": 0.41590961813926697, + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.5, + "completions/max_terminated_length": 656.5, + "completions/mean_length": 270.4375, + "completions/mean_terminated_length": 270.4375, + "completions/min_length": 49.75, + "completions/min_terminated_length": 49.75, + "epoch": 0.956, + "grad_norm": 4.233368873596191, + "kl": 22.1875, + "learning_rate": 1.2040134230610902e-07, + "loss": 1.8096, + "num_tokens": 53648090.0, + "reward": 1.48828125, + "reward_std": 0.8358847498893738, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.3939337432384491, + "step": 1912, + "token_counts/after_target": 851.5, + "token_counts/after_think": 17.75, + "token_counts/before_target": 2342.25, + "token_counts/before_think": 1115.5 + }, + { + "avg_penalty/after_target": 2.3230100274086, + "avg_penalty/after_think": 2.8934817910194397, + "avg_penalty/before_target": 0.3343803100287914, + "avg_penalty/before_think": 0.5891569703817368, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 181.8125, + "completions/mean_terminated_length": 181.8125, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.9565, + "grad_norm": 4.807729244232178, + "kl": 22.765625, + "learning_rate": 1.1771618553447217e-07, + "loss": 1.8541, + "num_tokens": 53667870.0, + "reward": 1.51171875, + "reward_std": 0.8051778823137283, + "rewards/accuracy_reward/mean": 
0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.42685678601264954, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.3934093564748764, + "step": 1913, + "token_counts/after_target": 249.0, + "token_counts/after_think": 53.25, + "token_counts/before_target": 1757.0, + "token_counts/before_think": 849.75 + }, + { + "avg_penalty/after_target": 1.6927973628044128, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.43011143803596497, + "avg_penalty/before_think": 0.5215176418423653, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.25, + "completions/max_terminated_length": 562.25, + "completions/mean_length": 229.140625, + "completions/mean_terminated_length": 229.140625, + "completions/min_length": 49.5, + "completions/min_terminated_length": 49.5, + "epoch": 0.957, + "grad_norm": 3.2546284198760986, + "kl": 17.546875, + "learning_rate": 1.1506113191316447e-07, + "loss": 1.489, + "num_tokens": 53692247.0, + "reward": 1.58203125, + "reward_std": 0.7429747879505157, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4383598491549492, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.3298487477004528, + "step": 1914, + "token_counts/after_target": 655.0, + "token_counts/after_think": 34.0, + "token_counts/before_target": 1896.75, + "token_counts/before_think": 1080.5 + }, + { + "avg_penalty/after_target": 2.024078756570816, + "avg_penalty/after_think": 3.7649786472320557, + "avg_penalty/before_target": 0.3660367578268051, + "avg_penalty/before_think": 0.3582102805376053, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 
0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 183.15625, + "completions/mean_terminated_length": 183.15625, + "completions/min_length": 43.5, + "completions/min_terminated_length": 43.5, + "epoch": 0.9575, + "grad_norm": 3.24404239654541, + "kl": 19.25, + "learning_rate": 1.1243618952994195e-07, + "loss": 1.6091, + "num_tokens": 53711713.0, + "reward": 1.54296875, + "reward_std": 0.8000508695840836, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4519384130835533, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.37000222504138947, + "step": 1915, + "token_counts/after_target": 445.25, + "token_counts/after_think": 23.25, + "token_counts/before_target": 1497.25, + "token_counts/before_think": 964.75 + }, + { + "avg_penalty/after_target": 2.0306944847106934, + "avg_penalty/after_think": 3.8141082525253296, + "avg_penalty/before_target": 0.42122800648212433, + "avg_penalty/before_think": 0.44927679002285004, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.25, + "completions/max_terminated_length": 572.25, + "completions/mean_length": 195.40625, + "completions/mean_terminated_length": 195.40625, + "completions/min_length": 50.5, + "completions/min_terminated_length": 50.5, + "epoch": 0.958, + "grad_norm": 4.752730369567871, + "kl": 21.453125, + "learning_rate": 1.0984136638083176e-07, + "loss": 1.973, + "num_tokens": 53741051.0, + "reward": 1.625, + "reward_std": 0.728366881608963, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + 
"rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.32909662649035454, + "step": 1916, + "token_counts/after_target": 480.5, + "token_counts/after_think": 126.25, + "token_counts/before_target": 1417.25, + "token_counts/before_think": 1102.5 + }, + { + "avg_penalty/after_target": 2.048409163951874, + "avg_penalty/after_think": 3.664998412132263, + "avg_penalty/before_target": 0.33321911096572876, + "avg_penalty/before_think": 0.5309322401881218, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 160.96875, + "completions/mean_terminated_length": 160.96875, + "completions/min_length": 37.25, + "completions/min_terminated_length": 37.25, + "epoch": 0.9585, + "grad_norm": 6.220645427703857, + "kl": 19.46875, + "learning_rate": 1.0727667037011668e-07, + "loss": 1.8253, + "num_tokens": 53762825.0, + "reward": 1.65625, + "reward_std": 0.6528115123510361, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.36967839300632477, + "rewards/tag_count_reward/mean": 0.859375, + "rewards/tag_count_reward/std": 0.3043528273701668, + "step": 1917, + "token_counts/after_target": 167.25, + "token_counts/after_think": 238.25, + "token_counts/before_target": 1347.0, + "token_counts/before_think": 823.0 + }, + { + "avg_penalty/after_target": 2.205032080411911, + "avg_penalty/after_think": 3.9228777289390564, + "avg_penalty/before_target": 0.37276221811771393, + "avg_penalty/before_think": 0.5524245649576187, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 670.5, + "completions/max_terminated_length": 670.5, + "completions/mean_length": 205.09375, + "completions/mean_terminated_length": 205.09375, + "completions/min_length": 52.75, + "completions/min_terminated_length": 52.75, + "epoch": 0.959, + "grad_norm": 4.910767078399658, + "kl": 23.78125, + "learning_rate": 1.0474210931030516e-07, + "loss": 1.9105, + "num_tokens": 53784991.0, + "reward": 1.56640625, + "reward_std": 0.7936258763074875, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4000816270709038, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.39402150362730026, + "step": 1918, + "token_counts/after_target": 527.0, + "token_counts/after_think": 21.5, + "token_counts/before_target": 1909.75, + "token_counts/before_think": 823.25 + }, + { + "avg_penalty/after_target": 1.9227202832698822, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.5985496491193771, + "avg_penalty/before_think": 0.43925710767507553, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.25, + "completions/max_terminated_length": 699.25, + "completions/mean_length": 199.140625, + "completions/mean_terminated_length": 199.140625, + "completions/min_length": 50.75, + "completions/min_terminated_length": 50.75, + "epoch": 0.9595, + "grad_norm": 10.818790435791016, + "kl": 18.96875, + "learning_rate": 1.0223769092211012e-07, + "loss": 1.9729, + "num_tokens": 53808104.0, + "reward": 1.6484375, + "reward_std": 0.7192855179309845, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4097762927412987, + 
"rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.32155390456318855, + "step": 1919, + "token_counts/after_target": 701.0, + "token_counts/after_think": 24.25, + "token_counts/before_target": 1411.75, + "token_counts/before_think": 1049.25 + }, + { + "avg_penalty/after_target": 3.4415067434310913, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.350481279194355, + "avg_penalty/before_think": 0.42092615365982056, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 702.5, + "completions/max_terminated_length": 609.75, + "completions/mean_length": 240.875, + "completions/mean_terminated_length": 228.06771087646484, + "completions/min_length": 38.75, + "completions/min_terminated_length": 38.75, + "epoch": 0.96, + "grad_norm": 8.689793586730957, + "kl": 23.09375, + "learning_rate": 9.976342283442464e-08, + "loss": 2.2055, + "num_tokens": 53833152.0, + "reward": 1.57421875, + "reward_std": 0.7989821434020996, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4361884370446205, + "rewards/tag_count_reward/mean": 0.80859375, + "rewards/tag_count_reward/std": 0.3795235827565193, + "step": 1920, + "token_counts/after_target": 866.5, + "token_counts/after_think": 55.25, + "token_counts/before_target": 1637.75, + "token_counts/before_think": 1294.5 + }, + { + "avg_penalty/after_target": 1.6578784883022308, + "avg_penalty/after_think": 3.7217600345611572, + "avg_penalty/before_target": 0.5402842313051224, + "avg_penalty/before_think": 0.5817411839962006, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + 
"completions/max_length": 806.75, + "completions/max_terminated_length": 613.75, + "completions/mean_length": 284.921875, + "completions/mean_terminated_length": 261.4729232788086, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.9605, + "grad_norm": 4.4710588455200195, + "kl": 25.796875, + "learning_rate": 9.731931258429638e-08, + "loss": 2.0743, + "num_tokens": 53861675.0, + "reward": 1.3515625, + "reward_std": 0.8508572727441788, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.43655145168304443, + "rewards/tag_count_reward/mean": 0.6953125, + "rewards/tag_count_reward/std": 0.42779491096735, + "step": 1921, + "token_counts/after_target": 907.0, + "token_counts/after_think": 278.0, + "token_counts/before_target": 2253.75, + "token_counts/before_think": 1120.0 + }, + { + "avg_penalty/after_target": 2.7447268962860107, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4360106587409973, + "avg_penalty/before_think": 0.46965277194976807, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.75, + "completions/max_terminated_length": 653.75, + "completions/mean_length": 241.625, + "completions/mean_terminated_length": 241.625, + "completions/min_length": 37.75, + "completions/min_terminated_length": 37.75, + "epoch": 0.961, + "grad_norm": 6.672300338745117, + "kl": 21.890625, + "learning_rate": 9.490536761691205e-08, + "loss": 2.0614, + "num_tokens": 53890259.0, + "reward": 1.63671875, + "reward_std": 0.7160296365618706, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.38688503205776215, + "rewards/tag_count_reward/mean": 0.82421875, + 
"rewards/tag_count_reward/std": 0.34562020748853683, + "step": 1922, + "token_counts/after_target": 953.5, + "token_counts/after_think": 61.0, + "token_counts/before_target": 1939.5, + "token_counts/before_think": 912.0 + }, + { + "avg_penalty/after_target": 2.538925290107727, + "avg_penalty/after_think": 2.511133372783661, + "avg_penalty/before_target": 0.4994485154747963, + "avg_penalty/before_think": 0.4292112961411476, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.25, + "completions/max_terminated_length": 652.25, + "completions/mean_length": 237.875, + "completions/mean_terminated_length": 237.875, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.9615, + "grad_norm": 3.931875467300415, + "kl": 19.5546875, + "learning_rate": 9.252159528556404e-08, + "loss": 1.6797, + "num_tokens": 53915819.0, + "reward": 1.5390625, + "reward_std": 0.8137963563203812, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.43655145168304443, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.3865860626101494, + "step": 1923, + "token_counts/after_target": 806.5, + "token_counts/after_think": 11.25, + "token_counts/before_target": 1682.0, + "token_counts/before_think": 1306.25 + }, + { + "avg_penalty/after_target": 3.1229792535305023, + "avg_penalty/after_think": 1.8606510162353516, + "avg_penalty/before_target": 0.2769450843334198, + "avg_penalty/before_think": 0.4082479663193226, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.25, + "completions/max_terminated_length": 
511.25, + "completions/mean_length": 186.75, + "completions/mean_terminated_length": 186.75, + "completions/min_length": 33.25, + "completions/min_terminated_length": 33.25, + "epoch": 0.962, + "grad_norm": 5.249748229980469, + "kl": 25.296875, + "learning_rate": 9.016800285163718e-08, + "loss": 2.0118, + "num_tokens": 53938475.0, + "reward": 1.453125, + "reward_std": 0.8459894508123398, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4550696536898613, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.39991196244955063, + "step": 1924, + "token_counts/after_target": 416.75, + "token_counts/after_think": 6.25, + "token_counts/before_target": 1755.5, + "token_counts/before_think": 809.5 + }, + { + "avg_penalty/after_target": 2.186769485473633, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.40725933760404587, + "avg_penalty/before_think": 0.383626364171505, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.25, + "completions/max_terminated_length": 636.25, + "completions/mean_length": 205.421875, + "completions/mean_terminated_length": 205.421875, + "completions/min_length": 38.5, + "completions/min_terminated_length": 38.5, + "epoch": 0.9625, + "grad_norm": 4.020220756530762, + "kl": 12.6953125, + "learning_rate": 8.784459748458318e-08, + "loss": 1.2691, + "num_tokens": 53960998.0, + "reward": 1.67578125, + "reward_std": 0.701043426990509, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.3987511098384857, + "rewards/tag_count_reward/mean": 0.86328125, + "rewards/tag_count_reward/std": 0.3155995272099972, + "step": 1925, + "token_counts/after_target": 
398.75, + "token_counts/after_think": 204.5, + "token_counts/before_target": 1433.75, + "token_counts/before_think": 1249.75 + }, + { + "avg_penalty/after_target": 2.6744072437286377, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3684578128159046, + "avg_penalty/before_think": 0.48592347651720047, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.25, + "completions/max_terminated_length": 593.25, + "completions/mean_length": 240.4375, + "completions/mean_terminated_length": 240.4375, + "completions/min_length": 50.75, + "completions/min_terminated_length": 50.75, + "epoch": 0.963, + "grad_norm": 7.857656955718994, + "kl": 22.109375, + "learning_rate": 8.555138626189619e-08, + "loss": 1.6775, + "num_tokens": 53985938.0, + "reward": 1.48046875, + "reward_std": 0.8315445333719254, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4471946656703949, + "rewards/tag_count_reward/mean": 0.76171875, + "rewards/tag_count_reward/std": 0.4061720743775368, + "step": 1926, + "token_counts/after_target": 569.25, + "token_counts/after_think": 37.0, + "token_counts/before_target": 1905.5, + "token_counts/before_think": 1335.25 + }, + { + "avg_penalty/after_target": 2.1542753875255585, + "avg_penalty/after_think": 3.6821564435958862, + "avg_penalty/before_target": 0.3531685695052147, + "avg_penalty/before_think": 0.5211904719471931, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.25, + "completions/max_terminated_length": 576.25, + "completions/mean_length": 238.484375, + "completions/mean_terminated_length": 238.484375, + 
"completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.9635, + "grad_norm": 5.1173319816589355, + "kl": 17.34375, + "learning_rate": 8.328837616909612e-08, + "loss": 1.6433, + "num_tokens": 54010497.0, + "reward": 1.6015625, + "reward_std": 0.6663882657885551, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.41168536990880966, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.28619512915611267, + "step": 1927, + "token_counts/after_target": 614.5, + "token_counts/after_think": 131.5, + "token_counts/before_target": 1783.75, + "token_counts/before_think": 1286.0 + }, + { + "avg_penalty/after_target": 2.068391740322113, + "avg_penalty/after_think": 3.7514583468437195, + "avg_penalty/before_target": 0.43868812918663025, + "avg_penalty/before_think": 0.6937814727425575, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 704.75, + "completions/max_terminated_length": 602.25, + "completions/mean_length": 226.5, + "completions/mean_terminated_length": 214.9322967529297, + "completions/min_length": 40.25, + "completions/min_terminated_length": 40.25, + "epoch": 0.964, + "grad_norm": 7.895651817321777, + "kl": 19.2734375, + "learning_rate": 8.105557409970433e-08, + "loss": 1.8514, + "num_tokens": 54035969.0, + "reward": 1.62109375, + "reward_std": 0.722114235162735, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.3925696536898613, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.32154396176338196, + "step": 1928, + "token_counts/after_target": 726.5, + "token_counts/after_think": 129.25, + 
"token_counts/before_target": 1787.5, + "token_counts/before_think": 980.75 + }, + { + "avg_penalty/after_target": 2.7114462554454803, + "avg_penalty/after_think": 1.5606905817985535, + "avg_penalty/before_target": 0.509772889316082, + "avg_penalty/before_think": 0.38781963288784027, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 748.75, + "completions/max_terminated_length": 702.25, + "completions/mean_length": 281.890625, + "completions/mean_terminated_length": 271.1843795776367, + "completions/min_length": 57.25, + "completions/min_terminated_length": 57.25, + "epoch": 0.9645, + "grad_norm": 3.974515914916992, + "kl": 26.40625, + "learning_rate": 7.885298685522235e-08, + "loss": 2.2447, + "num_tokens": 54066746.0, + "reward": 1.41015625, + "reward_std": 0.8938752561807632, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.46566852182149887, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.4389329180121422, + "step": 1929, + "token_counts/after_target": 1274.0, + "token_counts/after_think": 31.25, + "token_counts/before_target": 2114.0, + "token_counts/before_think": 1091.0 + }, + { + "avg_penalty/after_target": 3.1738380789756775, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.6026639938354492, + "avg_penalty/before_think": 0.4903927147388458, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 842.5, + "completions/max_terminated_length": 825.75, + "completions/mean_length": 271.640625, + "completions/mean_terminated_length": 260.5187530517578, + "completions/min_length": 
57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.965, + "grad_norm": 4.347255706787109, + "kl": 27.75, + "learning_rate": 7.66806211451132e-08, + "loss": 2.4771, + "num_tokens": 54096723.0, + "reward": 1.3984375, + "reward_std": 0.9194280505180359, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4761601909995079, + "rewards/tag_count_reward/mean": 0.7109375, + "rewards/tag_count_reward/std": 0.44752202183008194, + "step": 1930, + "token_counts/after_target": 1255.25, + "token_counts/after_think": 10.0, + "token_counts/before_target": 2261.75, + "token_counts/before_think": 819.25 + }, + { + "avg_penalty/after_target": 2.544979363679886, + "avg_penalty/after_think": 3.58818119764328, + "avg_penalty/before_target": 0.3675740472972393, + "avg_penalty/before_think": 0.5119260922074318, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.75, + "completions/max_terminated_length": 547.75, + "completions/mean_length": 211.609375, + "completions/mean_terminated_length": 211.609375, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.9655, + "grad_norm": 3.151455879211426, + "kl": 19.921875, + "learning_rate": 7.453848358678018e-08, + "loss": 1.6867, + "num_tokens": 54120938.0, + "reward": 1.5390625, + "reward_std": 0.7636328786611557, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4493217319250107, + "rewards/tag_count_reward/mean": 0.8203125, + "rewards/tag_count_reward/std": 0.35113419592380524, + "step": 1931, + "token_counts/after_target": 602.5, + "token_counts/after_think": 34.25, + "token_counts/before_target": 1668.0, + 
"token_counts/before_think": 1081.0 + }, + { + "avg_penalty/after_target": 2.261781692504883, + "avg_penalty/after_think": 1.652467668056488, + "avg_penalty/before_target": 0.47718704864382744, + "avg_penalty/before_think": 0.5670934617519379, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 693.25, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 208.953125, + "completions/mean_terminated_length": 196.6177101135254, + "completions/min_length": 27.5, + "completions/min_terminated_length": 27.5, + "epoch": 0.966, + "grad_norm": 5.745752811431885, + "kl": 14.78125, + "learning_rate": 7.242658070554465e-08, + "loss": 1.4454, + "num_tokens": 54144263.0, + "reward": 1.703125, + "reward_std": 0.6685112863779068, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.38336414843797684, + "rewards/tag_count_reward/mean": 0.875, + "rewards/tag_count_reward/std": 0.3001937307417393, + "step": 1932, + "token_counts/after_target": 691.5, + "token_counts/after_think": 68.5, + "token_counts/before_target": 1148.0, + "token_counts/before_think": 1435.25 + }, + { + "avg_penalty/after_target": 2.373839408159256, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3193688504397869, + "avg_penalty/before_think": 0.49402307718992233, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.75, + "completions/max_terminated_length": 548.75, + "completions/mean_length": 176.59375, + "completions/mean_terminated_length": 176.59375, + "completions/min_length": 38.5, + "completions/min_terminated_length": 38.5, + "epoch": 
0.9665, + "grad_norm": 3.989823579788208, + "kl": 24.0, + "learning_rate": 7.034491893463059e-08, + "loss": 1.9621, + "num_tokens": 54164877.0, + "reward": 1.54296875, + "reward_std": 0.8007821887731552, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.43655145168304443, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.3775385394692421, + "step": 1933, + "token_counts/after_target": 371.5, + "token_counts/after_think": 27.0, + "token_counts/before_target": 1597.0, + "token_counts/before_think": 830.0 + }, + { + "avg_penalty/after_target": 2.622299313545227, + "avg_penalty/after_think": 1.9152826070785522, + "avg_penalty/before_target": 0.3096531741321087, + "avg_penalty/before_think": 0.40986550226807594, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 216.34375, + "completions/mean_terminated_length": 216.34375, + "completions/min_length": 49.75, + "completions/min_terminated_length": 49.75, + "epoch": 0.967, + "grad_norm": 5.1103434562683105, + "kl": 29.6875, + "learning_rate": 6.829350461514007e-08, + "loss": 2.4207, + "num_tokens": 54186803.0, + "reward": 1.3671875, + "reward_std": 0.8288179486989975, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.46875541657209396, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.3902217224240303, + "step": 1934, + "token_counts/after_target": 731.5, + "token_counts/after_think": 106.75, + "token_counts/before_target": 1712.75, + "token_counts/before_think": 910.5 + }, + { + "avg_penalty/after_target": 
2.491009920835495, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.40119263157248497, + "avg_penalty/before_think": 0.4765986427664757, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 609.0, + "completions/max_terminated_length": 516.25, + "completions/mean_length": 211.4375, + "completions/mean_terminated_length": 199.55416870117188, + "completions/min_length": 50.75, + "completions/min_terminated_length": 50.75, + "epoch": 0.9675, + "grad_norm": 7.811454772949219, + "kl": 18.921875, + "learning_rate": 6.627234399603554e-08, + "loss": 1.8175, + "num_tokens": 54209599.0, + "reward": 1.59375, + "reward_std": 0.7343224585056305, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.3921433389186859, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.35014135390520096, + "step": 1935, + "token_counts/after_target": 681.5, + "token_counts/after_think": 116.5, + "token_counts/before_target": 1607.0, + "token_counts/before_think": 978.0 + }, + { + "avg_penalty/after_target": 2.2885380387306213, + "avg_penalty/after_think": 1.9456573128700256, + "avg_penalty/before_target": 0.31508003547787666, + "avg_penalty/before_think": 0.4146312549710274, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.5, + "completions/max_terminated_length": 585.5, + "completions/mean_length": 194.75, + "completions/mean_terminated_length": 194.75, + "completions/min_length": 40.25, + "completions/min_terminated_length": 40.25, + "epoch": 0.968, + "grad_norm": 2.5602471828460693, + "kl": 23.640625, + "learning_rate": 
6.428144323412544e-08, + "loss": 2.0197, + "num_tokens": 54236191.0, + "reward": 1.578125, + "reward_std": 0.8451828509569168, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.08539126068353653, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.381050743162632, + "step": 1936, + "token_counts/after_target": 470.25, + "token_counts/after_think": 146.25, + "token_counts/before_target": 1669.5, + "token_counts/before_think": 830.0 + }, + { + "avg_penalty/after_target": 2.324650079011917, + "avg_penalty/after_think": 3.6904431581497192, + "avg_penalty/before_target": 0.3905121609568596, + "avg_penalty/before_think": 0.6010214686393738, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.25, + "completions/max_terminated_length": 593.25, + "completions/mean_length": 233.734375, + "completions/mean_terminated_length": 233.734375, + "completions/min_length": 42.25, + "completions/min_terminated_length": 42.25, + "epoch": 0.9685, + "grad_norm": 3.4278857707977295, + "kl": 17.625, + "learning_rate": 6.232080839403631e-08, + "loss": 1.5989, + "num_tokens": 54263358.0, + "reward": 1.578125, + "reward_std": 0.7859378159046173, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.43303824216127396, + "rewards/tag_count_reward/mean": 0.8125, + "rewards/tag_count_reward/std": 0.36497098207473755, + "step": 1937, + "token_counts/after_target": 567.25, + "token_counts/after_think": 151.0, + "token_counts/before_target": 1978.25, + "token_counts/before_think": 1043.25 + }, + { + "avg_penalty/after_target": 1.8424429893493652, + "avg_penalty/after_think": 3.599396526813507, 
+ "avg_penalty/before_target": 0.42835893481969833, + "avg_penalty/before_think": 0.5154271721839905, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 686.25, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 221.71875, + "completions/mean_terminated_length": 209.5968780517578, + "completions/min_length": 48.5, + "completions/min_terminated_length": 48.5, + "epoch": 0.969, + "grad_norm": 3.1293649673461914, + "kl": 21.34375, + "learning_rate": 6.039044544820404e-08, + "loss": 1.8266, + "num_tokens": 54289388.0, + "reward": 1.69921875, + "reward_std": 0.7023439556360245, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.37149807065725327, + "rewards/tag_count_reward/mean": 0.85546875, + "rewards/tag_count_reward/std": 0.32065530121326447, + "step": 1938, + "token_counts/after_target": 651.25, + "token_counts/after_think": 89.0, + "token_counts/before_target": 1740.5, + "token_counts/before_think": 1066.75 + }, + { + "avg_penalty/after_target": 2.169455111026764, + "avg_penalty/after_think": 3.7080726623535156, + "avg_penalty/before_target": 0.5161036774516106, + "avg_penalty/before_think": 0.5808611735701561, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.25, + "completions/max_terminated_length": 569.25, + "completions/mean_length": 225.875, + "completions/mean_terminated_length": 225.875, + "completions/min_length": 41.25, + "completions/min_terminated_length": 41.25, + "epoch": 0.9695, + "grad_norm": 11.113396644592285, + "kl": 10.234375, + "learning_rate": 5.849036027684607e-08, + "loss": 
1.2683, + "num_tokens": 54316356.0, + "reward": 1.77734375, + "reward_std": 0.5686168149113655, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.3454566150903702, + "rewards/tag_count_reward/mean": 0.91796875, + "rewards/tag_count_reward/std": 0.23018774390220642, + "step": 1939, + "token_counts/after_target": 594.25, + "token_counts/after_think": 143.75, + "token_counts/before_target": 1469.75, + "token_counts/before_think": 1406.25 + }, + { + "avg_penalty/after_target": 2.3114370703697205, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4625023752450943, + "avg_penalty/before_think": 0.3899194225668907, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.25, + "completions/max_terminated_length": 590.25, + "completions/mean_length": 229.96875, + "completions/mean_terminated_length": 229.96875, + "completions/min_length": 46.75, + "completions/min_terminated_length": 46.75, + "epoch": 0.97, + "grad_norm": 4.60176944732666, + "kl": 22.53125, + "learning_rate": 5.662055866795357e-08, + "loss": 1.8354, + "num_tokens": 54338066.0, + "reward": 1.45703125, + "reward_std": 0.8368483483791351, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.46566852182149887, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.39850640296936035, + "step": 1940, + "token_counts/after_target": 655.5, + "token_counts/after_think": 34.0, + "token_counts/before_target": 1785.25, + "token_counts/before_think": 1204.75 + }, + { + "avg_penalty/after_target": 2.6090734899044037, + "avg_penalty/after_think": 3.5713083148002625, + "avg_penalty/before_target": 
0.43097497522830963, + "avg_penalty/before_think": 0.6358617693185806, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.5, + "completions/max_terminated_length": 574.5, + "completions/mean_length": 242.953125, + "completions/mean_terminated_length": 242.953125, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.9705, + "grad_norm": 4.732800483703613, + "kl": 22.4375, + "learning_rate": 5.4781046317267103e-08, + "loss": 1.9251, + "num_tokens": 54362111.0, + "reward": 1.5, + "reward_std": 0.830204963684082, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.3898652270436287, + "step": 1941, + "token_counts/after_target": 675.75, + "token_counts/after_think": 91.5, + "token_counts/before_target": 1866.75, + "token_counts/before_think": 1253.25 + }, + { + "avg_penalty/after_target": 2.5340230762958527, + "avg_penalty/after_think": 3.941865384578705, + "avg_penalty/before_target": 0.46018432825803757, + "avg_penalty/before_think": 0.45006396621465683, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 670.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 252.390625, + "completions/mean_terminated_length": 239.8375015258789, + "completions/min_length": 43.25, + "completions/min_terminated_length": 43.25, + "epoch": 0.971, + "grad_norm": 2.959129810333252, + "kl": 22.421875, + "learning_rate": 5.29718288282588e-08, + "loss": 1.9137, + "num_tokens": 54388040.0, + "reward": 
1.53515625, + "reward_std": 0.8139875829219818, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4308478757739067, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.39211585372686386, + "step": 1942, + "token_counts/after_target": 814.5, + "token_counts/after_think": 95.5, + "token_counts/before_target": 1846.0, + "token_counts/before_think": 1282.25 + }, + { + "avg_penalty/after_target": 2.4344946444034576, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3368580527603626, + "avg_penalty/before_think": 0.5480709299445152, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 202.5, + "completions/mean_terminated_length": 202.5, + "completions/min_length": 39.25, + "completions/min_terminated_length": 39.25, + "epoch": 0.9715, + "grad_norm": 7.728389739990234, + "kl": 17.59375, + "learning_rate": 5.119291171211793e-08, + "loss": 1.7682, + "num_tokens": 54410296.0, + "reward": 1.65625, + "reward_std": 0.6554586887359619, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4075859263539314, + "rewards/tag_count_reward/mean": 0.859375, + "rewards/tag_count_reward/std": 0.2671235427260399, + "step": 1943, + "token_counts/after_target": 609.75, + "token_counts/after_think": 112.75, + "token_counts/before_target": 1515.25, + "token_counts/before_think": 1002.25 + }, + { + "avg_penalty/after_target": 2.467067837715149, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4811958074569702, + "avg_penalty/before_think": 0.4970308095216751, + "clip_ratio/high_max": 
0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.75, + "completions/max_terminated_length": 607.75, + "completions/mean_length": 210.59375, + "completions/mean_terminated_length": 210.59375, + "completions/min_length": 42.25, + "completions/min_terminated_length": 42.25, + "epoch": 0.972, + "grad_norm": 12.637757301330566, + "kl": 18.515625, + "learning_rate": 4.944430038773762e-08, + "loss": 1.9515, + "num_tokens": 54434046.0, + "reward": 1.65234375, + "reward_std": 0.6653473526239395, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.41194770485162735, + "rewards/tag_count_reward/mean": 0.87109375, + "rewards/tag_count_reward/std": 0.2713710442185402, + "step": 1944, + "token_counts/after_target": 657.25, + "token_counts/after_think": 311.5, + "token_counts/before_target": 1625.5, + "token_counts/before_think": 775.25 + }, + { + "avg_penalty/after_target": 1.963068574666977, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.45482131838798523, + "avg_penalty/before_think": 0.5508516579866409, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 192.65625, + "completions/mean_terminated_length": 192.65625, + "completions/min_length": 35.5, + "completions/min_terminated_length": 35.5, + "epoch": 0.9725, + "grad_norm": 3.4250895977020264, + "kl": 16.65625, + "learning_rate": 4.772600018168816e-08, + "loss": 1.5792, + "num_tokens": 54462376.0, + "reward": 1.67578125, + "reward_std": 0.7163465917110443, + "rewards/accuracy_reward/mean": 0.015625, + 
"rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.39123913645744324, + "rewards/tag_count_reward/mean": 0.84765625, + "rewards/tag_count_reward/std": 0.3195148929953575, + "step": 1945, + "token_counts/after_target": 420.0, + "token_counts/after_think": 23.25, + "token_counts/before_target": 1823.25, + "token_counts/before_think": 816.0 + }, + { + "avg_penalty/after_target": 2.9179596304893494, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.5357658714056015, + "avg_penalty/before_think": 0.3311321325600147, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 737.25, + "completions/max_terminated_length": 696.5, + "completions/mean_length": 219.15625, + "completions/mean_terminated_length": 206.26354598999023, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.973, + "grad_norm": 11.552209854125977, + "kl": 26.8125, + "learning_rate": 4.603801632821148e-08, + "loss": 2.6657, + "num_tokens": 54484338.0, + "reward": 1.58203125, + "reward_std": 0.7619459331035614, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4229728877544403, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.3563830852508545, + "step": 1946, + "token_counts/after_target": 1180.25, + "token_counts/after_think": 16.5, + "token_counts/before_target": 1328.25, + "token_counts/before_think": 981.5 + }, + { + "avg_penalty/after_target": 2.165843039751053, + "avg_penalty/after_think": 3.8667158484458923, + "avg_penalty/before_target": 0.32242320850491524, + "avg_penalty/before_think": 0.5892290771007538, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.5, + "completions/max_terminated_length": 492.5, + "completions/mean_length": 167.890625, + "completions/mean_terminated_length": 167.890625, + "completions/min_length": 40.5, + "completions/min_terminated_length": 40.5, + "epoch": 0.9735, + "grad_norm": 4.6466264724731445, + "kl": 13.0390625, + "learning_rate": 4.438035396920004e-08, + "loss": 1.3398, + "num_tokens": 54508699.0, + "reward": 1.734375, + "reward_std": 0.6838320940732956, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.3604728877544403, + "rewards/tag_count_reward/mean": 0.875, + "rewards/tag_count_reward/std": 0.3134319335222244, + "step": 1947, + "token_counts/after_target": 356.75, + "token_counts/after_think": 92.5, + "token_counts/before_target": 1393.75, + "token_counts/before_think": 843.25 + }, + { + "avg_penalty/after_target": 2.085276871919632, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4066324457526207, + "avg_penalty/before_think": 0.48206590861082077, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 243.296875, + "completions/mean_terminated_length": 243.296875, + "completions/min_length": 41.25, + "completions/min_terminated_length": 41.25, + "epoch": 0.974, + "grad_norm": 4.593237400054932, + "kl": 16.78125, + "learning_rate": 4.275301815417909e-08, + "loss": 1.5966, + "num_tokens": 54533182.0, + "reward": 1.60546875, + "reward_std": 0.7229655086994171, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4079566150903702, + "rewards/tag_count_reward/mean": 0.83984375, + "rewards/tag_count_reward/std": 0.3381194621324539, + "step": 1948, + "token_counts/after_target": 573.25, + "token_counts/after_think": 99.5, + "token_counts/before_target": 2188.75, + "token_counts/before_think": 1031.25 + }, + { + "avg_penalty/after_target": 2.473546326160431, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.386396124958992, + "avg_penalty/before_think": 0.5816602185368538, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 219.703125, + "completions/mean_terminated_length": 219.703125, + "completions/min_length": 59.25, + "completions/min_terminated_length": 59.25, + "epoch": 0.9745, + "grad_norm": 4.447041988372803, + "kl": 22.671875, + "learning_rate": 4.115601384029666e-08, + "loss": 1.9797, + "num_tokens": 54555483.0, + "reward": 1.5078125, + "reward_std": 0.8238567560911179, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44938503205776215, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.3855567052960396, + "step": 1949, + "token_counts/after_target": 600.75, + "token_counts/after_think": 180.0, + "token_counts/before_target": 2005.75, + "token_counts/before_think": 728.75 + }, + { + "avg_penalty/after_target": 2.586057037115097, + "avg_penalty/after_think": 2.7463266849517822, + "avg_penalty/before_target": 0.448380246758461, + "avg_penalty/before_think": 0.40147778019309044, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.75, + "completions/max_terminated_length": 431.75, + "completions/mean_length": 165.90625, + "completions/mean_terminated_length": 165.90625, + "completions/min_length": 41.5, + "completions/min_terminated_length": 41.5, + "epoch": 0.975, + "grad_norm": 12.392317771911621, + "kl": 17.140625, + "learning_rate": 3.9589345892304673e-08, + "loss": 1.8754, + "num_tokens": 54576389.0, + "reward": 1.59375, + "reward_std": 0.7653421610593796, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4229728877544403, + "rewards/tag_count_reward/mean": 0.828125, + "rewards/tag_count_reward/std": 0.3664432466030121, + "step": 1950, + "token_counts/after_target": 454.0, + "token_counts/after_think": 244.5, + "token_counts/before_target": 1016.5, + "token_counts/before_think": 939.5 + }, + { + "avg_penalty/after_target": 1.5707563161849976, + "avg_penalty/after_think": 3.1997175216674805, + "avg_penalty/before_target": 0.3139871396124363, + "avg_penalty/before_think": 0.39395246654748917, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 191.953125, + "completions/mean_terminated_length": 191.953125, + "completions/min_length": 38.5, + "completions/min_terminated_length": 38.5, + "epoch": 0.9755, + "grad_norm": 6.1691155433654785, + "kl": 13.0625, + "learning_rate": 3.805301908254455e-08, + "loss": 1.0253, + "num_tokens": 54601282.0, + "reward": 1.74609375, + "reward_std": 0.5741234570741653, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 
0.3529609143733978, + "rewards/tag_count_reward/mean": 0.90234375, + "rewards/tag_count_reward/std": 0.23505917936563492, + "step": 1951, + "token_counts/after_target": 206.5, + "token_counts/after_think": 22.5, + "token_counts/before_target": 1626.0, + "token_counts/before_think": 1216.25 + }, + { + "avg_penalty/after_target": 2.8164286613464355, + "avg_penalty/after_think": 2.7963039875030518, + "avg_penalty/before_target": 0.7210658937692642, + "avg_penalty/before_think": 0.7863619774580002, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.25, + "completions/max_terminated_length": 842.25, + "completions/mean_length": 278.75, + "completions/mean_terminated_length": 278.75, + "completions/min_length": 40.25, + "completions/min_terminated_length": 40.25, + "epoch": 0.976, + "grad_norm": 11.029969215393066, + "kl": 27.4921875, + "learning_rate": 3.654703809093607e-08, + "loss": 2.6149, + "num_tokens": 54628194.0, + "reward": 1.3671875, + "reward_std": 0.8438520580530167, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.65625, + "rewards/format_reward/std": 0.4622559919953346, + "rewards/tag_count_reward/mean": 0.7109375, + "rewards/tag_count_reward/std": 0.3943040408194065, + "step": 1952, + "token_counts/after_target": 1638.5, + "token_counts/after_think": 67.5, + "token_counts/before_target": 1932.5, + "token_counts/before_think": 821.5 + }, + { + "avg_penalty/after_target": 2.7654476165771484, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.42266885563731194, + "avg_penalty/before_think": 0.4837227761745453, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + 
"completions/max_length": 629.25, + "completions/max_terminated_length": 629.25, + "completions/mean_length": 229.984375, + "completions/mean_terminated_length": 229.984375, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.9765, + "grad_norm": 4.063337326049805, + "kl": 20.140625, + "learning_rate": 3.50714075049563e-08, + "loss": 1.6043, + "num_tokens": 54656225.0, + "reward": 1.44921875, + "reward_std": 0.8549297004938126, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4613594636321068, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.4133722335100174, + "step": 1953, + "token_counts/after_target": 871.5, + "token_counts/after_think": 37.0, + "token_counts/before_target": 1858.0, + "token_counts/before_think": 913.25 + }, + { + "avg_penalty/after_target": 1.7493039667606354, + "avg_penalty/after_think": 3.8425007462501526, + "avg_penalty/before_target": 0.42015865817666054, + "avg_penalty/before_think": 0.4900035411119461, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 218.265625, + "completions/mean_terminated_length": 218.265625, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.977, + "grad_norm": 4.295379161834717, + "kl": 17.9375, + "learning_rate": 3.362613181963404e-08, + "loss": 1.4602, + "num_tokens": 54686802.0, + "reward": 1.5625, + "reward_std": 0.8093910366296768, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4519384130835533, + "rewards/tag_count_reward/mean": 
0.8125, + "rewards/tag_count_reward/std": 0.36966951191425323, + "step": 1954, + "token_counts/after_target": 535.0, + "token_counts/after_think": 46.0, + "token_counts/before_target": 1680.75, + "token_counts/before_think": 1230.5 + }, + { + "avg_penalty/after_target": 2.070436865091324, + "avg_penalty/after_think": 2.847874879837036, + "avg_penalty/before_target": 0.5217357315123081, + "avg_penalty/before_think": 0.45659879595041275, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.75, + "completions/max_terminated_length": 715.75, + "completions/mean_length": 219.4375, + "completions/mean_terminated_length": 219.4375, + "completions/min_length": 48.25, + "completions/min_terminated_length": 48.25, + "epoch": 0.9775, + "grad_norm": 4.411993026733398, + "kl": 27.09375, + "learning_rate": 3.22112154375287e-08, + "loss": 2.2067, + "num_tokens": 54712174.0, + "reward": 1.44921875, + "reward_std": 0.8752775490283966, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4625816270709038, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.42650987952947617, + "step": 1955, + "token_counts/after_target": 755.5, + "token_counts/after_think": 57.75, + "token_counts/before_target": 1829.0, + "token_counts/before_think": 868.75 + }, + { + "avg_penalty/after_target": 3.119325876235962, + "avg_penalty/after_think": 1.0, + "avg_penalty/before_target": 0.3550833500921726, + "avg_penalty/before_think": 0.5417724773287773, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.25, + "completions/max_terminated_length": 
446.25, + "completions/mean_length": 196.84375, + "completions/mean_terminated_length": 196.84375, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.978, + "grad_norm": 5.427331447601318, + "kl": 20.0703125, + "learning_rate": 3.082666266872036e-08, + "loss": 1.8669, + "num_tokens": 54736580.0, + "reward": 1.51171875, + "reward_std": 0.769904300570488, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45247192680835724, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.3509277105331421, + "step": 1956, + "token_counts/after_target": 783.5, + "token_counts/after_think": 53.0, + "token_counts/before_target": 1631.75, + "token_counts/before_think": 681.25 + }, + { + "avg_penalty/after_target": 2.3553190529346466, + "avg_penalty/after_think": 1.6326453685760498, + "avg_penalty/before_target": 0.3220496401190758, + "avg_penalty/before_think": 0.30341238901019096, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.5, + "completions/max_terminated_length": 577.5, + "completions/mean_length": 193.296875, + "completions/mean_terminated_length": 193.296875, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.9785, + "grad_norm": 9.347453117370605, + "kl": 28.0, + "learning_rate": 2.947247773079753e-08, + "loss": 2.0623, + "num_tokens": 54760935.0, + "reward": 1.39453125, + "reward_std": 0.892426997423172, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4613594636321068, + "rewards/tag_count_reward/mean": 0.70703125, + "rewards/tag_count_reward/std": 0.4385659247636795, + "step": 1957, + 
"token_counts/after_target": 560.25, + "token_counts/after_think": 9.0, + "token_counts/before_target": 2018.75, + "token_counts/before_think": 504.75 + }, + { + "avg_penalty/after_target": 2.782366693019867, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.48492884635925293, + "avg_penalty/before_think": 0.46176736801862717, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 616.5, + "completions/max_terminated_length": 499.25, + "completions/mean_length": 184.359375, + "completions/mean_terminated_length": 170.88854217529297, + "completions/min_length": 38.25, + "completions/min_terminated_length": 38.25, + "epoch": 0.979, + "grad_norm": 9.08404541015625, + "kl": 21.84375, + "learning_rate": 2.8148664748842702e-08, + "loss": 2.0046, + "num_tokens": 54787902.0, + "reward": 1.5078125, + "reward_std": 0.8382412046194077, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.4493217319250107, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.383667454123497, + "step": 1958, + "token_counts/after_target": 676.25, + "token_counts/after_think": 37.5, + "token_counts/before_target": 1462.75, + "token_counts/before_think": 773.25 + }, + { + "avg_penalty/after_target": 3.2212742269039154, + "avg_penalty/after_think": 3.217980772256851, + "avg_penalty/before_target": 0.3287205770611763, + "avg_penalty/before_think": 0.4297475889325142, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.75, + "completions/max_terminated_length": 460.75, + "completions/mean_length": 155.390625, + 
"completions/mean_terminated_length": 155.390625, + "completions/min_length": 35.75, + "completions/min_terminated_length": 35.75, + "epoch": 0.9795, + "grad_norm": 5.598674774169922, + "kl": 21.0625, + "learning_rate": 2.6855227755419046e-08, + "loss": 1.9882, + "num_tokens": 54807399.0, + "reward": 1.54296875, + "reward_std": 0.7988941520452499, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4260597825050354, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.38518884778022766, + "step": 1959, + "token_counts/after_target": 622.75, + "token_counts/after_think": 30.5, + "token_counts/before_target": 1317.5, + "token_counts/before_think": 515.5 + }, + { + "avg_penalty/after_target": 2.4866623878479004, + "avg_penalty/after_think": 2.58005553483963, + "avg_penalty/before_target": 0.3531796485185623, + "avg_penalty/before_think": 0.3991272449493408, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.25, + "completions/max_terminated_length": 440.25, + "completions/mean_length": 163.5625, + "completions/mean_terminated_length": 163.5625, + "completions/min_length": 29.75, + "completions/min_terminated_length": 29.75, + "epoch": 0.98, + "grad_norm": 4.447474002838135, + "kl": 18.296875, + "learning_rate": 2.5592170690560415e-08, + "loss": 1.4322, + "num_tokens": 54828139.0, + "reward": 1.5703125, + "reward_std": 0.7888603955507278, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4000816270709038, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.3897155672311783, + "step": 1960, + "token_counts/after_target": 376.25, + 
"token_counts/after_think": 19.5, + "token_counts/before_target": 1268.75, + "token_counts/before_think": 952.5 + }, + { + "avg_penalty/after_target": 2.330598384141922, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4419409930706024, + "avg_penalty/before_think": 0.5119412429630756, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 717.5, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 209.625, + "completions/mean_terminated_length": 184.48542404174805, + "completions/min_length": 36.25, + "completions/min_terminated_length": 36.25, + "epoch": 0.9805, + "grad_norm": 6.851729393005371, + "kl": 23.3515625, + "learning_rate": 2.4359497401758026e-08, + "loss": 1.7983, + "num_tokens": 54852259.0, + "reward": 1.421875, + "reward_std": 0.8623217791318893, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.671875, + "rewards/format_reward/std": 0.4757782220840454, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.4055408835411072, + "step": 1961, + "token_counts/after_target": 762.25, + "token_counts/after_think": 27.5, + "token_counts/before_target": 1874.0, + "token_counts/before_think": 690.25 + }, + { + "avg_penalty/after_target": 2.127927213907242, + "avg_penalty/after_think": 3.5852235555648804, + "avg_penalty/before_target": 0.3524019345641136, + "avg_penalty/before_think": 0.46943625807762146, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 633.5, + "completions/max_terminated_length": 609.25, + "completions/mean_length": 217.359375, + "completions/mean_terminated_length": 207.17188262939453, + 
"completions/min_length": 59.75, + "completions/min_terminated_length": 59.75, + "epoch": 0.981, + "grad_norm": 7.793992519378662, + "kl": 14.09375, + "learning_rate": 2.315721164394713e-08, + "loss": 1.4596, + "num_tokens": 54874474.0, + "reward": 1.6640625, + "reward_std": 0.6934709697961807, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.39656074345111847, + "rewards/tag_count_reward/mean": 0.8671875, + "rewards/tag_count_reward/std": 0.31867852061986923, + "step": 1962, + "token_counts/after_target": 465.5, + "token_counts/after_think": 144.5, + "token_counts/before_target": 1792.25, + "token_counts/before_think": 1075.5 + }, + { + "avg_penalty/after_target": 2.498616099357605, + "avg_penalty/after_think": 2.9621017575263977, + "avg_penalty/before_target": 0.39368996024131775, + "avg_penalty/before_think": 0.46209781244397163, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 667.0, + "completions/max_terminated_length": 603.75, + "completions/mean_length": 206.0, + "completions/mean_terminated_length": 194.2291717529297, + "completions/min_length": 53.75, + "completions/min_terminated_length": 53.75, + "epoch": 0.9815, + "grad_norm": 9.238115310668945, + "kl": 23.515625, + "learning_rate": 2.1985317079500358e-08, + "loss": 2.2637, + "num_tokens": 54898410.0, + "reward": 1.5078125, + "reward_std": 0.8039236143231392, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44187305867671967, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.37997738271951675, + "step": 1963, + "token_counts/after_target": 660.25, + "token_counts/after_think": 239.5, + 
"token_counts/before_target": 1720.0, + "token_counts/before_think": 676.25 + }, + { + "avg_penalty/after_target": 3.164923310279846, + "avg_penalty/after_think": 3.1870248913764954, + "avg_penalty/before_target": 0.4328061565756798, + "avg_penalty/before_think": 0.5552197247743607, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.25, + "completions/max_terminated_length": 551.25, + "completions/mean_length": 230.125, + "completions/mean_terminated_length": 230.125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.982, + "grad_norm": 9.500210762023926, + "kl": 17.5234375, + "learning_rate": 2.0843817278209943e-08, + "loss": 1.7723, + "num_tokens": 54921282.0, + "reward": 1.58203125, + "reward_std": 0.6924204379320145, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.37675637751817703, + "rewards/tag_count_reward/mean": 0.81640625, + "rewards/tag_count_reward/std": 0.33455636352300644, + "step": 1964, + "token_counts/after_target": 882.75, + "token_counts/after_think": 55.5, + "token_counts/before_target": 1485.0, + "token_counts/before_think": 1258.75 + }, + { + "avg_penalty/after_target": 1.9255179166793823, + "avg_penalty/after_think": 3.9425346851348877, + "avg_penalty/before_target": 0.3727872632443905, + "avg_penalty/before_think": 0.6911629289388657, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.25, + "completions/max_terminated_length": 636.25, + "completions/mean_length": 241.578125, + "completions/mean_terminated_length": 241.578125, + "completions/min_length": 34.5, + 
"completions/min_terminated_length": 34.5, + "epoch": 0.9825, + "grad_norm": 4.129374027252197, + "kl": 20.65625, + "learning_rate": 1.973271571728441e-08, + "loss": 1.7887, + "num_tokens": 54947207.0, + "reward": 1.4296875, + "reward_std": 0.8868145197629929, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.10077822208404541, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.47987766563892365, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.39427489042282104, + "step": 1965, + "token_counts/after_target": 580.5, + "token_counts/after_think": 333.0, + "token_counts/before_target": 1853.75, + "token_counts/before_think": 1098.0 + }, + { + "avg_penalty/after_target": 2.5538398921489716, + "avg_penalty/after_think": 3.7199472188949585, + "avg_penalty/before_target": 0.30249808728694916, + "avg_penalty/before_think": 0.579340435564518, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.25, + "completions/max_terminated_length": 537.25, + "completions/mean_length": 217.796875, + "completions/mean_terminated_length": 217.796875, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.983, + "grad_norm": 6.194338798522949, + "kl": 18.015625, + "learning_rate": 1.86520157813308e-08, + "loss": 1.3961, + "num_tokens": 54974794.0, + "reward": 1.51953125, + "reward_std": 0.814767062664032, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.46296359598636627, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.3812340497970581, + "step": 1966, + "token_counts/after_target": 342.25, + "token_counts/after_think": 43.5, + "token_counts/before_target": 1912.5, + 
"token_counts/before_think": 1186.5 + }, + { + "avg_penalty/after_target": 2.4221548438072205, + "avg_penalty/after_think": 2.4368338584899902, + "avg_penalty/before_target": 0.3391479514539242, + "avg_penalty/before_think": 0.47183743119239807, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.5, + "completions/max_terminated_length": 658.5, + "completions/mean_length": 225.296875, + "completions/mean_terminated_length": 225.296875, + "completions/min_length": 46.75, + "completions/min_terminated_length": 46.75, + "epoch": 0.9835, + "grad_norm": 7.811206817626953, + "kl": 18.8125, + "learning_rate": 1.7601720762346895e-08, + "loss": 1.8605, + "num_tokens": 55000253.0, + "reward": 1.7421875, + "reward_std": 0.6456765383481979, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.34944770485162735, + "rewards/tag_count_reward/mean": 0.8828125, + "rewards/tag_count_reward/std": 0.30516182631254196, + "step": 1967, + "token_counts/after_target": 740.0, + "token_counts/after_think": 38.75, + "token_counts/before_target": 1591.75, + "token_counts/before_think": 1234.25 + }, + { + "avg_penalty/after_target": 1.7027766406536102, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4190647080540657, + "avg_penalty/before_think": 0.6025677770376205, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 799.25, + "completions/max_terminated_length": 799.25, + "completions/mean_length": 238.4375, + "completions/mean_terminated_length": 238.4375, + "completions/min_length": 43.75, + "completions/min_terminated_length": 43.75, + "epoch": 
0.984, + "grad_norm": 4.501383304595947, + "kl": 20.4375, + "learning_rate": 1.6581833859716788e-08, + "loss": 1.7025, + "num_tokens": 55024761.0, + "reward": 1.62890625, + "reward_std": 0.7662186771631241, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.374237596988678, + "step": 1968, + "token_counts/after_target": 533.75, + "token_counts/after_think": 152.5, + "token_counts/before_target": 1921.25, + "token_counts/before_think": 1207.5 + }, + { + "avg_penalty/after_target": 2.4451848566532135, + "avg_penalty/after_think": 3.711335062980652, + "avg_penalty/before_target": 0.3431051969528198, + "avg_penalty/before_think": 0.7930124551057816, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.25, + "completions/max_terminated_length": 521.25, + "completions/mean_length": 195.71875, + "completions/mean_terminated_length": 195.71875, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.9845, + "grad_norm": 10.825963973999023, + "kl": 11.6171875, + "learning_rate": 1.5592358180189782e-08, + "loss": 1.3668, + "num_tokens": 55046631.0, + "reward": 1.6484375, + "reward_std": 0.7247003316879272, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.4066260978579521, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.33905013650655746, + "step": 1969, + "token_counts/after_target": 537.5, + "token_counts/after_think": 172.0, + "token_counts/before_target": 1307.5, + "token_counts/before_think": 1114.5 + }, + { + "avg_penalty/after_target": 
1.9946212470531464, + "avg_penalty/after_think": 3.725666582584381, + "avg_penalty/before_target": 0.34427575021982193, + "avg_penalty/before_think": 0.46199066936969757, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.5, + "completions/max_terminated_length": 711.5, + "completions/mean_length": 230.578125, + "completions/mean_terminated_length": 230.578125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.985, + "grad_norm": 2.815075635910034, + "kl": 25.21875, + "learning_rate": 1.4633296737882607e-08, + "loss": 2.1301, + "num_tokens": 55073724.0, + "reward": 1.45703125, + "reward_std": 0.8711531162261963, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4682852029800415, + "rewards/tag_count_reward/mean": 0.75390625, + "rewards/tag_count_reward/std": 0.4155847653746605, + "step": 1970, + "token_counts/after_target": 611.0, + "token_counts/after_think": 113.0, + "token_counts/before_target": 2062.5, + "token_counts/before_think": 902.75 + }, + { + "avg_penalty/after_target": 2.6343377232551575, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.3816208057105541, + "avg_penalty/before_think": 0.5826556533575058, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.25, + "completions/max_terminated_length": 579.25, + "completions/mean_length": 234.640625, + "completions/mean_terminated_length": 234.640625, + "completions/min_length": 36.75, + "completions/min_terminated_length": 36.75, + "epoch": 0.9855, + "grad_norm": 2.813584566116333, + "kl": 19.984375, + 
"learning_rate": 1.370465245426167e-08, + "loss": 1.6771, + "num_tokens": 55097765.0, + "reward": 1.51171875, + "reward_std": 0.8086449950933456, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.79296875, + "rewards/tag_count_reward/std": 0.38221607357263565, + "step": 1971, + "token_counts/after_target": 629.5, + "token_counts/after_think": 188.5, + "token_counts/before_target": 1794.25, + "token_counts/before_think": 1142.0 + }, + { + "avg_penalty/after_target": 2.2254863679409027, + "avg_penalty/after_think": 3.7779484391212463, + "avg_penalty/before_target": 0.335302259773016, + "avg_penalty/before_think": 0.40283817797899246, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 617.25, + "completions/max_terminated_length": 513.25, + "completions/mean_length": 209.765625, + "completions/mean_terminated_length": 198.10104370117188, + "completions/min_length": 38.5, + "completions/min_terminated_length": 38.5, + "epoch": 0.986, + "grad_norm": 3.869821548461914, + "kl": 18.59375, + "learning_rate": 1.2806428158138596e-08, + "loss": 1.629, + "num_tokens": 55121446.0, + "reward": 1.60546875, + "reward_std": 0.7827922403812408, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4255262687802315, + "rewards/tag_count_reward/mean": 0.82421875, + "rewards/tag_count_reward/std": 0.3531993180513382, + "step": 1972, + "token_counts/after_target": 494.0, + "token_counts/after_think": 144.5, + "token_counts/before_target": 1952.25, + "token_counts/before_think": 765.5 + }, + { + "avg_penalty/after_target": 1.860745906829834, + 
"avg_penalty/after_think": 2.8991329073905945, + "avg_penalty/before_target": 0.4396083429455757, + "avg_penalty/before_think": 0.45483700186014175, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.5, + "completions/max_terminated_length": 666.5, + "completions/mean_length": 254.0625, + "completions/mean_terminated_length": 254.0625, + "completions/min_length": 58.25, + "completions/min_terminated_length": 58.25, + "epoch": 0.9865, + "grad_norm": 3.0443036556243896, + "kl": 18.125, + "learning_rate": 1.1938626585660252e-08, + "loss": 1.6246, + "num_tokens": 55147466.0, + "reward": 1.71875, + "reward_std": 0.6779657751321793, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.3723389655351639, + "rewards/tag_count_reward/mean": 0.875, + "rewards/tag_count_reward/std": 0.3128286600112915, + "step": 1973, + "token_counts/after_target": 576.0, + "token_counts/after_think": 116.75, + "token_counts/before_target": 1882.0, + "token_counts/before_think": 1490.25 + }, + { + "avg_penalty/after_target": 2.3590280413627625, + "avg_penalty/after_think": 3.5077885389328003, + "avg_penalty/before_target": 0.5044506192207336, + "avg_penalty/before_think": 0.36254744231700897, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 704.75, + "completions/max_terminated_length": 612.0, + "completions/mean_length": 225.8125, + "completions/mean_terminated_length": 214.05937957763672, + "completions/min_length": 42.5, + "completions/min_terminated_length": 42.5, + "epoch": 0.987, + "grad_norm": 15.742496490478516, + "kl": 15.609375, + "learning_rate": 
1.1101250380300965e-08, + "loss": 1.919, + "num_tokens": 55171246.0, + "reward": 1.70703125, + "reward_std": 0.6970252692699432, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3683478757739067, + "rewards/tag_count_reward/mean": 0.86328125, + "rewards/tag_count_reward/std": 0.3173222169280052, + "step": 1974, + "token_counts/after_target": 900.25, + "token_counts/after_think": 94.0, + "token_counts/before_target": 1746.0, + "token_counts/before_think": 872.75 + }, + { + "avg_penalty/after_target": 2.89713317155838, + "avg_penalty/after_think": 1.5785169005393982, + "avg_penalty/before_target": 0.4210687503218651, + "avg_penalty/before_think": 0.5911815464496613, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.25, + "completions/max_terminated_length": 649.25, + "completions/mean_length": 232.3125, + "completions/mean_terminated_length": 232.3125, + "completions/min_length": 37.75, + "completions/min_terminated_length": 37.75, + "epoch": 0.9875, + "grad_norm": 10.815048217773438, + "kl": 19.859375, + "learning_rate": 1.0294302092853647e-08, + "loss": 2.0127, + "num_tokens": 55199106.0, + "reward": 1.5, + "reward_std": 0.8053969740867615, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4682852029800415, + "rewards/tag_count_reward/mean": 0.796875, + "rewards/tag_count_reward/std": 0.3667605370283127, + "step": 1975, + "token_counts/after_target": 1179.5, + "token_counts/after_think": 32.5, + "token_counts/before_target": 1513.0, + "token_counts/before_think": 992.0 + }, + { + "avg_penalty/after_target": 2.221786081790924, + "avg_penalty/after_think": 2.6663074493408203, + 
"avg_penalty/before_target": 0.3926040157675743, + "avg_penalty/before_think": 0.5234757363796234, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.75, + "completions/max_terminated_length": 607.75, + "completions/mean_length": 230.046875, + "completions/mean_terminated_length": 230.046875, + "completions/min_length": 60.5, + "completions/min_terminated_length": 60.5, + "epoch": 0.988, + "grad_norm": 3.0132181644439697, + "kl": 21.390625, + "learning_rate": 9.517784181422018e-09, + "loss": 1.8197, + "num_tokens": 55223669.0, + "reward": 1.484375, + "reward_std": 0.7998781055212021, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4534844756126404, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.36803436279296875, + "step": 1976, + "token_counts/after_target": 719.5, + "token_counts/after_think": 66.25, + "token_counts/before_target": 2003.75, + "token_counts/before_think": 891.25 + }, + { + "avg_penalty/after_target": 2.2999109029769897, + "avg_penalty/after_think": 1.6668102145195007, + "avg_penalty/before_target": 0.35048196092247963, + "avg_penalty/before_think": 0.5745276883244514, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 222.6875, + "completions/mean_terminated_length": 222.6875, + "completions/min_length": 47.75, + "completions/min_terminated_length": 47.75, + "epoch": 0.9885, + "grad_norm": 4.483514308929443, + "kl": 22.671875, + "learning_rate": 8.771699011416169e-09, + "loss": 1.9078, + "num_tokens": 
55248801.0, + "reward": 1.484375, + "reward_std": 0.7995489984750748, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.42739029973745346, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.38109902292490005, + "step": 1977, + "token_counts/after_target": 704.75, + "token_counts/after_think": 23.5, + "token_counts/before_target": 1892.75, + "token_counts/before_think": 942.0 + }, + { + "avg_penalty/after_target": 2.658359110355377, + "avg_penalty/after_think": 2.940125584602356, + "avg_penalty/before_target": 0.3512357510626316, + "avg_penalty/before_think": 0.45400047302246094, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 560.25, + "completions/max_terminated_length": 510.5, + "completions/mean_length": 205.25, + "completions/mean_terminated_length": 192.86042022705078, + "completions/min_length": 22.75, + "completions/min_terminated_length": 22.75, + "epoch": 0.989, + "grad_norm": 6.282118797302246, + "kl": 22.375, + "learning_rate": 8.056048855540344e-09, + "loss": 1.7453, + "num_tokens": 55275073.0, + "reward": 1.43359375, + "reward_std": 0.8521295785903931, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.46450965851545334, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.40357915312051773, + "step": 1978, + "token_counts/after_target": 692.75, + "token_counts/after_think": 67.0, + "token_counts/before_target": 1414.75, + "token_counts/before_think": 1109.5 + }, + { + "avg_penalty/after_target": 1.8283996284008026, + "avg_penalty/after_think": 3.6227007508277893, + "avg_penalty/before_target": 0.3844569995999336, + 
"avg_penalty/before_think": 0.469160795211792, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.25, + "completions/max_terminated_length": 601.25, + "completions/mean_length": 212.640625, + "completions/mean_terminated_length": 212.640625, + "completions/min_length": 36.5, + "completions/min_terminated_length": 36.5, + "epoch": 0.9895, + "grad_norm": 8.967345237731934, + "kl": 25.34375, + "learning_rate": 7.370835893788508e-09, + "loss": 1.8803, + "num_tokens": 55302826.0, + "reward": 1.46875, + "reward_std": 0.8297800123691559, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4459725022315979, + "rewards/tag_count_reward/mean": 0.765625, + "rewards/tag_count_reward/std": 0.4059264361858368, + "step": 1979, + "token_counts/after_target": 437.25, + "token_counts/after_think": 27.0, + "token_counts/before_target": 2265.25, + "token_counts/before_think": 672.75 + }, + { + "avg_penalty/after_target": 3.126462399959564, + "avg_penalty/after_think": 2.9738404154777527, + "avg_penalty/before_target": 0.4607028216123581, + "avg_penalty/before_think": 0.565221942961216, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.25, + "completions/max_terminated_length": 488.25, + "completions/mean_length": 174.71875, + "completions/mean_terminated_length": 174.71875, + "completions/min_length": 48.25, + "completions/min_terminated_length": 48.25, + "epoch": 0.99, + "grad_norm": 11.835445404052734, + "kl": 15.875, + "learning_rate": 6.716062213437679e-09, + "loss": 1.787, + "num_tokens": 55323752.0, + "reward": 1.73828125, + "reward_std": 
0.6275716722011566, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.84375, + "rewards/format_reward/std": 0.3604728877544403, + "rewards/tag_count_reward/mean": 0.89453125, + "rewards/tag_count_reward/std": 0.29178136587142944, + "step": 1980, + "token_counts/after_target": 625.25, + "token_counts/after_think": 20.0, + "token_counts/before_target": 1071.75, + "token_counts/before_think": 1078.5 + }, + { + "avg_penalty/after_target": 2.269071340560913, + "avg_penalty/after_think": 2.4777868390083313, + "avg_penalty/before_target": 0.4310641810297966, + "avg_penalty/before_think": 0.4513263925909996, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.25, + "completions/max_terminated_length": 676.25, + "completions/mean_length": 231.421875, + "completions/mean_terminated_length": 231.421875, + "completions/min_length": 37.75, + "completions/min_terminated_length": 37.75, + "epoch": 0.9905, + "grad_norm": 3.8883233070373535, + "kl": 22.40625, + "learning_rate": 6.091729809042379e-09, + "loss": 1.8809, + "num_tokens": 55348851.0, + "reward": 1.5234375, + "reward_std": 0.803899273276329, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.42707233130931854, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.3984595686197281, + "step": 1981, + "token_counts/after_target": 645.5, + "token_counts/after_think": 46.0, + "token_counts/before_target": 1634.5, + "token_counts/before_think": 1376.75 + }, + { + "avg_penalty/after_target": 2.3104192912578583, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4425305761396885, + "avg_penalty/before_think": 0.6133643239736557, + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 633.75, + "completions/max_terminated_length": 584.75, + "completions/mean_length": 215.390625, + "completions/mean_terminated_length": 202.71667098999023, + "completions/min_length": 40.75, + "completions/min_terminated_length": 40.75, + "epoch": 0.991, + "grad_norm": 21.127883911132812, + "kl": 27.09375, + "learning_rate": 5.497840582429082e-09, + "loss": 2.1375, + "num_tokens": 55375324.0, + "reward": 1.35546875, + "reward_std": 0.8931576758623123, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.640625, + "rewards/format_reward/std": 0.48813462257385254, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.4283505603671074, + "step": 1982, + "token_counts/after_target": 724.25, + "token_counts/after_think": 71.25, + "token_counts/before_target": 1818.0, + "token_counts/before_think": 832.75 + }, + { + "avg_penalty/after_target": 2.4321957528591156, + "avg_penalty/after_think": 3.5359649062156677, + "avg_penalty/before_target": 0.4489850215613842, + "avg_penalty/before_think": 0.5694205388426781, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 744.75, + "completions/max_terminated_length": 721.0, + "completions/mean_length": 250.125, + "completions/mean_terminated_length": 238.3697967529297, + "completions/min_length": 38.5, + "completions/min_terminated_length": 38.5, + "epoch": 0.9915, + "grad_norm": 5.254440784454346, + "kl": 24.296875, + "learning_rate": 4.9343963426840006e-09, + "loss": 1.9596, + "num_tokens": 55402020.0, + "reward": 1.51171875, + "reward_std": 0.8249835222959518, + 
"rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44187305867671967, + "rewards/tag_count_reward/mean": 0.77734375, + "rewards/tag_count_reward/std": 0.3970467522740364, + "step": 1983, + "token_counts/after_target": 723.25, + "token_counts/after_think": 176.0, + "token_counts/before_target": 2046.5, + "token_counts/before_think": 1056.25 + }, + { + "avg_penalty/after_target": 3.0033246278762817, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.4025672674179077, + "avg_penalty/before_think": 0.679434284567833, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 678.5, + "completions/max_terminated_length": 613.75, + "completions/mean_length": 258.4375, + "completions/mean_terminated_length": 247.4791717529297, + "completions/min_length": 39.25, + "completions/min_terminated_length": 39.25, + "epoch": 0.992, + "grad_norm": 4.59867525100708, + "kl": 17.8125, + "learning_rate": 4.4013988061597515e-09, + "loss": 1.6621, + "num_tokens": 55430032.0, + "reward": 1.45703125, + "reward_std": 0.8376295119524002, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4713720977306366, + "rewards/tag_count_reward/mean": 0.76953125, + "rewards/tag_count_reward/std": 0.39077553898096085, + "step": 1984, + "token_counts/after_target": 704.5, + "token_counts/after_think": 260.0, + "token_counts/before_target": 2081.0, + "token_counts/before_think": 1089.5 + }, + { + "avg_penalty/after_target": 2.2875793874263763, + "avg_penalty/after_think": 2.6356658339500427, + "avg_penalty/before_target": 0.5172237753868103, + "avg_penalty/before_think": 0.6469371467828751, + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 804.75, + "completions/max_terminated_length": 708.75, + "completions/mean_length": 255.75, + "completions/mean_terminated_length": 244.26145935058594, + "completions/min_length": 46.25, + "completions/min_terminated_length": 46.25, + "epoch": 0.9925, + "grad_norm": 6.570420265197754, + "kl": 21.59375, + "learning_rate": 3.898849596456477e-09, + "loss": 2.0443, + "num_tokens": 55456256.0, + "reward": 1.43359375, + "reward_std": 0.8661776781082153, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.46566852182149887, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.4185057133436203, + "step": 1985, + "token_counts/after_target": 992.5, + "token_counts/after_think": 167.75, + "token_counts/before_target": 1939.25, + "token_counts/before_think": 992.5 + }, + { + "avg_penalty/after_target": 2.560661196708679, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.34831996634602547, + "avg_penalty/before_think": 0.6711824238300323, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.25, + "completions/max_terminated_length": 565.25, + "completions/mean_length": 225.796875, + "completions/mean_terminated_length": 225.796875, + "completions/min_length": 57.75, + "completions/min_terminated_length": 57.75, + "epoch": 0.993, + "grad_norm": 6.796959400177002, + "kl": 16.828125, + "learning_rate": 3.4267502444274013e-09, + "loss": 1.6674, + "num_tokens": 55482115.0, + "reward": 1.58203125, + "reward_std": 0.7430315017700195, + "rewards/accuracy_reward/mean": 0.0, + 
"rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4440634250640869, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.3313347399234772, + "step": 1986, + "token_counts/after_target": 529.5, + "token_counts/after_think": 203.25, + "token_counts/before_target": 1588.75, + "token_counts/before_think": 1291.25 + }, + { + "avg_penalty/after_target": 2.038367599248886, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.4040585905313492, + "avg_penalty/before_think": 0.5368029102683067, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 667.5, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 236.09375, + "completions/mean_terminated_length": 225.08021545410156, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.9935, + "grad_norm": 7.576768398284912, + "kl": 15.7578125, + "learning_rate": 2.9851021881688314e-09, + "loss": 1.5484, + "num_tokens": 55507881.0, + "reward": 1.58203125, + "reward_std": 0.7379928380250931, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4260597825050354, + "rewards/tag_count_reward/mean": 0.83203125, + "rewards/tag_count_reward/std": 0.33909450471401215, + "step": 1987, + "token_counts/after_target": 707.25, + "token_counts/after_think": 232.0, + "token_counts/before_target": 1696.5, + "token_counts/before_think": 1141.75 + }, + { + "avg_penalty/after_target": 2.3866028487682343, + "avg_penalty/after_think": 2.748306691646576, + "avg_penalty/before_target": 0.40118274837732315, + "avg_penalty/before_think": 0.47739672660827637, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.25, + "completions/max_terminated_length": 583.25, + "completions/mean_length": 216.5625, + "completions/mean_terminated_length": 216.5625, + "completions/min_length": 65.25, + "completions/min_terminated_length": 65.25, + "epoch": 0.994, + "grad_norm": 6.165738582611084, + "kl": 17.1875, + "learning_rate": 2.573906773016832e-09, + "loss": 1.7127, + "num_tokens": 55532749.0, + "reward": 1.640625, + "reward_std": 0.7166961282491684, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.39476002007722855, + "rewards/tag_count_reward/mean": 0.84375, + "rewards/tag_count_reward/std": 0.3413452208042145, + "step": 1988, + "token_counts/after_target": 622.5, + "token_counts/after_think": 185.75, + "token_counts/before_target": 1206.75, + "token_counts/before_think": 1450.0 + }, + { + "avg_penalty/after_target": 1.648655116558075, + "avg_penalty/after_think": 3.737129032611847, + "avg_penalty/before_target": 0.33458593487739563, + "avg_penalty/before_think": 0.43473897129297256, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.25, + "completions/max_terminated_length": 430.25, + "completions/mean_length": 147.9375, + "completions/mean_terminated_length": 147.9375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.9945, + "grad_norm": 3.0678868293762207, + "kl": 17.921875, + "learning_rate": 2.193165251545004e-09, + "loss": 1.5768, + "num_tokens": 55552761.0, + "reward": 1.5859375, + "reward_std": 0.7965084910392761, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + 
"rewards/format_reward/mean": 0.78125, + "rewards/format_reward/std": 0.4176512807607651, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.38316264748573303, + "step": 1989, + "token_counts/after_target": 182.25, + "token_counts/after_think": 157.25, + "token_counts/before_target": 1115.5, + "token_counts/before_think": 912.0 + }, + { + "avg_penalty/after_target": 2.8592252135276794, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.38981954008340836, + "avg_penalty/before_think": 0.5681178793311119, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.25, + "completions/max_terminated_length": 631.25, + "completions/mean_length": 210.625, + "completions/mean_terminated_length": 210.625, + "completions/min_length": 44.75, + "completions/min_terminated_length": 44.75, + "epoch": 0.995, + "grad_norm": 9.329834938049316, + "kl": 17.828125, + "learning_rate": 1.8428787835578222e-09, + "loss": 1.779, + "num_tokens": 55576113.0, + "reward": 1.6796875, + "reward_std": 0.6847751289606094, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3758598491549492, + "rewards/tag_count_reward/mean": 0.8515625, + "rewards/tag_count_reward/std": 0.31146082282066345, + "step": 1990, + "token_counts/after_target": 583.75, + "token_counts/after_think": 101.75, + "token_counts/before_target": 1758.75, + "token_counts/before_think": 925.75 + }, + { + "avg_penalty/after_target": 2.682889014482498, + "avg_penalty/after_think": 3.5450714826583862, + "avg_penalty/before_target": 0.3802071288228035, + "avg_penalty/before_think": 0.4644910618662834, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.5, + "completions/max_terminated_length": 513.5, + "completions/mean_length": 234.28125, + "completions/mean_terminated_length": 234.28125, + "completions/min_length": 37.75, + "completions/min_terminated_length": 37.75, + "epoch": 0.9955, + "grad_norm": 2.96360445022583, + "kl": 18.671875, + "learning_rate": 1.5230484360873043e-09, + "loss": 1.5604, + "num_tokens": 55602707.0, + "reward": 1.50390625, + "reward_std": 0.8049957156181335, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.45726002007722855, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.3834269717335701, + "step": 1991, + "token_counts/after_target": 520.0, + "token_counts/after_think": 35.75, + "token_counts/before_target": 2021.75, + "token_counts/before_think": 1171.0 + }, + { + "avg_penalty/after_target": 3.052387058734894, + "avg_penalty/after_think": 4.0, + "avg_penalty/before_target": 0.3403535410761833, + "avg_penalty/before_think": 0.43446990847587585, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.25, + "completions/max_terminated_length": 608.25, + "completions/mean_length": 216.828125, + "completions/mean_terminated_length": 216.828125, + "completions/min_length": 47.25, + "completions/min_terminated_length": 47.25, + "epoch": 0.996, + "grad_norm": 5.4885711669921875, + "kl": 19.90625, + "learning_rate": 1.233675183394123e-09, + "loss": 1.8427, + "num_tokens": 55628696.0, + "reward": 1.640625, + "reward_std": 0.6989780068397522, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 
0.4361884370446205, + "rewards/tag_count_reward/mean": 0.875, + "rewards/tag_count_reward/std": 0.3106425181031227, + "step": 1992, + "token_counts/after_target": 568.0, + "token_counts/after_think": 55.25, + "token_counts/before_target": 1812.5, + "token_counts/before_think": 1033.5 + }, + { + "avg_penalty/after_target": 2.732794910669327, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.3279414586722851, + "avg_penalty/before_think": 0.4442385248839855, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.25, + "completions/max_terminated_length": 491.25, + "completions/mean_length": 197.6875, + "completions/mean_terminated_length": 197.6875, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.9965, + "grad_norm": 2.4162344932556152, + "kl": 15.890625, + "learning_rate": 9.74759906957612e-10, + "loss": 1.3704, + "num_tokens": 55650820.0, + "reward": 1.53125, + "reward_std": 0.7989966869354248, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.41898179799318314, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 0.38871651142835617, + "step": 1993, + "token_counts/after_target": 539.25, + "token_counts/after_think": 22.25, + "token_counts/before_target": 1619.25, + "token_counts/before_think": 982.25 + }, + { + "avg_penalty/after_target": 2.785323828458786, + "avg_penalty/after_think": 2.0, + "avg_penalty/before_target": 0.5936207287013531, + "avg_penalty/before_think": 0.5173462368547916, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 735.75, + 
"completions/max_terminated_length": 735.75, + "completions/mean_length": 251.421875, + "completions/mean_terminated_length": 251.421875, + "completions/min_length": 37.5, + "completions/min_terminated_length": 37.5, + "epoch": 0.997, + "grad_norm": 4.379148006439209, + "kl": 25.8671875, + "learning_rate": 7.463033954802079e-10, + "loss": 2.126, + "num_tokens": 55677023.0, + "reward": 1.4609375, + "reward_std": 0.7309696227312088, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.436277836561203, + "rewards/tag_count_reward/mean": 0.7734375, + "rewards/tag_count_reward/std": 0.31329435110092163, + "step": 1994, + "token_counts/after_target": 1056.25, + "token_counts/after_think": 17.25, + "token_counts/before_target": 1621.5, + "token_counts/before_think": 1327.75 + }, + { + "avg_penalty/after_target": 2.104887068271637, + "avg_penalty/after_think": 2.979274034500122, + "avg_penalty/before_target": 0.5325452536344528, + "avg_penalty/before_think": 0.38810376822948456, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 728.75, + "completions/max_terminated_length": 615.0, + "completions/mean_length": 192.484375, + "completions/mean_terminated_length": 178.9562530517578, + "completions/min_length": 35.75, + "completions/min_terminated_length": 35.75, + "epoch": 0.9975, + "grad_norm": 3.9505016803741455, + "kl": 26.46875, + "learning_rate": 5.483063448785686e-10, + "loss": 2.3299, + "num_tokens": 55700510.0, + "reward": 1.44921875, + "reward_std": 0.836859792470932, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.4423432722687721, + "rewards/tag_count_reward/mean": 0.74609375, + 
"rewards/tag_count_reward/std": 0.40571577847003937, + "step": 1995, + "token_counts/after_target": 736.5, + "token_counts/after_think": 129.75, + "token_counts/before_target": 1577.25, + "token_counts/before_think": 636.25 + }, + { + "avg_penalty/after_target": 2.2338200211524963, + "avg_penalty/after_think": 3.9554706811904907, + "avg_penalty/before_target": 0.41932031512260437, + "avg_penalty/before_think": 0.5132484436035156, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.5, + "completions/max_terminated_length": 588.5, + "completions/mean_length": 245.765625, + "completions/mean_terminated_length": 245.765625, + "completions/min_length": 57.25, + "completions/min_terminated_length": 57.25, + "epoch": 0.998, + "grad_norm": 2.5490305423736572, + "kl": 17.6640625, + "learning_rate": 3.807693582869032e-10, + "loss": 1.5527, + "num_tokens": 55725295.0, + "reward": 1.56640625, + "reward_std": 0.8003302663564682, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.14789126068353653, + "rewards/format_reward/mean": 0.71875, + "rewards/format_reward/std": 0.43925637751817703, + "rewards/tag_count_reward/mean": 0.80078125, + "rewards/tag_count_reward/std": 0.3503713011741638, + "step": 1996, + "token_counts/after_target": 749.5, + "token_counts/after_think": 153.0, + "token_counts/before_target": 1592.0, + "token_counts/before_think": 1437.75 + }, + { + "avg_penalty/after_target": 2.4915552735328674, + "avg_penalty/after_think": 2.9979295134544373, + "avg_penalty/before_target": 0.32126086577773094, + "avg_penalty/before_think": 0.5018076673150063, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + 
"completions/max_terminated_length": 602.0, + "completions/mean_length": 207.921875, + "completions/mean_terminated_length": 207.921875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.9985, + "grad_norm": 2.8115031719207764, + "kl": 21.71875, + "learning_rate": 2.436929460525317e-10, + "loss": 1.8366, + "num_tokens": 55747754.0, + "reward": 1.51953125, + "reward_std": 0.7987924516201019, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.4414467439055443, + "rewards/tag_count_reward/mean": 0.78515625, + "rewards/tag_count_reward/std": 0.37961436063051224, + "step": 1997, + "token_counts/after_target": 501.25, + "token_counts/after_think": 185.25, + "token_counts/before_target": 1706.5, + "token_counts/before_think": 933.75 + }, + { + "avg_penalty/after_target": 2.0546545684337616, + "avg_penalty/after_think": 1.714600384235382, + "avg_penalty/before_target": 0.3880779966711998, + "avg_penalty/before_think": 0.42327693849802017, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.25, + "completions/max_terminated_length": 647.25, + "completions/mean_length": 219.59375, + "completions/mean_terminated_length": 219.59375, + "completions/min_length": 47.25, + "completions/min_terminated_length": 47.25, + "epoch": 0.999, + "grad_norm": 6.116604804992676, + "kl": 20.671875, + "learning_rate": 1.3707752573255406e-10, + "loss": 1.6087, + "num_tokens": 55769968.0, + "reward": 1.515625, + "reward_std": 0.8422037065029144, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.45508860796689987, + "rewards/tag_count_reward/mean": 0.78125, + "rewards/tag_count_reward/std": 
0.3982590585947037, + "step": 1998, + "token_counts/after_target": 438.0, + "token_counts/after_think": 38.25, + "token_counts/before_target": 1713.0, + "token_counts/before_think": 1324.25 + }, + { + "avg_penalty/after_target": 2.6859639286994934, + "avg_penalty/after_think": 2.7809337377548218, + "avg_penalty/before_target": 0.5189995542168617, + "avg_penalty/before_think": 0.4601953998208046, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.25, + "completions/max_terminated_length": 688.25, + "completions/mean_length": 211.265625, + "completions/mean_terminated_length": 211.265625, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.9995, + "grad_norm": 5.107114315032959, + "kl": 22.40625, + "learning_rate": 6.092342209607083e-11, + "loss": 2.1273, + "num_tokens": 55792049.0, + "reward": 1.5703125, + "reward_std": 0.7504388019442558, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.0625, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.41898179799318314, + "rewards/tag_count_reward/mean": 0.8046875, + "rewards/tag_count_reward/std": 0.3115224689245224, + "step": 1999, + "token_counts/after_target": 979.5, + "token_counts/after_think": 21.75, + "token_counts/before_target": 1572.25, + "token_counts/before_think": 806.75 + }, + { + "avg_penalty/after_target": 1.8782471716403961, + "avg_penalty/after_think": 3.0, + "avg_penalty/before_target": 0.58647720515728, + "avg_penalty/before_think": 0.5072036199271679, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.5, + "completions/max_terminated_length": 579.5, + "completions/mean_length": 
178.359375, + "completions/mean_terminated_length": 178.359375, + "completions/min_length": 37.5, + "completions/min_terminated_length": 37.5, + "epoch": 1.0, + "grad_norm": 7.125892639160156, + "kl": 22.90625, + "learning_rate": 1.5230867123072757e-11, + "loss": 2.1758, + "num_tokens": 55813288.0, + "reward": 1.5234375, + "reward_std": 0.8259042650461197, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44938503205776215, + "rewards/tag_count_reward/mean": 0.7890625, + "rewards/tag_count_reward/std": 0.3948109373450279, + "step": 2000, + "token_counts/after_target": 649.75, + "token_counts/after_think": 107.5, + "token_counts/before_target": 1469.25, + "token_counts/before_think": 627.25 + }, + { + "epoch": 1.0, + "step": 2000, + "total_flos": 0.0, + "train_loss": 1.5595111853230919, + "train_runtime": 31844.9004, + "train_samples_per_second": 0.251, + "train_steps_per_second": 0.063 + } + ], + "logging_steps": 1, + "max_steps": 2000, + "num_input_tokens_seen": 55813288, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}