{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2929, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.615234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1701.30078125, "completions/mean_terminated_length": 1146.9339599609375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.31653091311454773, "epoch": 0.0003414134516899966, "frac_reward_zero_std": 0.1875, "grad_norm": 0.09431354127610081, "learning_rate": 0.0, "loss": 0.0865, "num_tokens": 949674.0, "reward": 0.46435546875, "reward_std": 0.22952669858932495, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37646484375, "rewards/tag_count_reward/std": 0.1748575121164322, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.666015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1761.66796875, "completions/mean_terminated_length": 1190.6783447265625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3005528077483177, "epoch": 0.0006828269033799932, "frac_reward_zero_std": 0.1875, "grad_norm": 0.08289377047957829, "learning_rate": 3.4129692832764506e-09, "loss": 0.0605, "num_tokens": 1924880.0, "reward": 0.4033203125, "reward_std": 0.17597538232803345, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3623046875, "rewards/tag_count_reward/std": 0.15666835010051727, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.560546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1679.2265625, "completions/mean_terminated_length": 1208.8355712890625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 0.2905479222536087, "epoch": 0.0010242403550699897, "frac_reward_zero_std": 0.25, "grad_norm": 0.09289899509570086, "learning_rate": 6.825938566552901e-09, "loss": 0.051, "num_tokens": 2866516.0, "reward": 0.43017578125, "reward_std": 0.15755438804626465, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.38330078125, "rewards/tag_count_reward/std": 0.1538146436214447, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.68359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1769.6015625, "completions/mean_terminated_length": 1168.1234130859375, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "entropy": 0.30692359805107117, "epoch": 0.0013656538067599864, "frac_reward_zero_std": 0.09375, "grad_norm": 0.09350149566088037, "learning_rate": 1.023890784982935e-08, "loss": 0.061, "num_tokens": 3852952.0, "reward": 0.40771484375, "reward_std": 0.17636051774024963, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.36474609375, "rewards/tag_count_reward/std": 0.18008024990558624, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.544921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1636.400390625, "completions/mean_terminated_length": 1143.540771484375, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "entropy": 0.3029249608516693, "epoch": 0.001707067258449983, "frac_reward_zero_std": 0.0625, "grad_norm": 0.09379504290923228, "learning_rate": 1.3651877133105802e-08, "loss": 0.0604, "num_tokens": 4772293.0, "reward": 0.5302734375, "reward_std": 0.2438577562570572, "rewards/accuracy_reward/mean": 0.13306452333927155, "rewards/accuracy_reward/std": 0.3399873673915863, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4013671875, "rewards/tag_count_reward/std": 0.1692856103181839, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.544921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1626.189453125, "completions/mean_terminated_length": 1121.10302734375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.321226105093956, "epoch": 0.0020484807101399795, "frac_reward_zero_std": 0.03125, "grad_norm": 0.10878747887059917, "learning_rate": 1.706484641638225e-08, "loss": 0.1042, "num_tokens": 5692182.0, "reward": 0.4287109375, "reward_std": 0.18328802287578583, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3837890625, "rewards/tag_count_reward/std": 0.16230030357837677, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.630859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1724.513671875, "completions/mean_terminated_length": 1171.67724609375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "entropy": 0.3103385865688324, "epoch": 0.002389894161829976, "frac_reward_zero_std": 0.09375, "grad_norm": 0.09715143958888799, "learning_rate": 2.04778156996587e-08, "loss": 0.0943, "num_tokens": 6647517.0, "reward": 0.4013671875, "reward_std": 0.15177610516548157, "rewards/accuracy_reward/mean": 0.030241934582591057, "rewards/accuracy_reward/std": 0.1714252382516861, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3720703125, "rewards/tag_count_reward/std": 0.1669667512178421, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.537109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1701.44921875, "completions/mean_terminated_length": 1299.333251953125, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "entropy": 0.3014408051967621, "epoch": 0.0027313076135199728, "frac_reward_zero_std": 0.0625, "grad_norm": 0.09747423808198794, "learning_rate": 2.3890784982935154e-08, "loss": 0.1036, "num_tokens": 7595363.0, "reward": 0.52880859375, "reward_std": 0.26373738050460815, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.39990234375, "rewards/tag_count_reward/std": 0.152811199426651, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1673.47265625, "completions/mean_terminated_length": 1126.0865478515625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.32441169768571854, "epoch": 0.0030727210652099694, "frac_reward_zero_std": 0.09375, "grad_norm": 0.09892535414811007, "learning_rate": 2.7303754266211605e-08, "loss": 0.0967, "num_tokens": 8525973.0, "reward": 0.43017578125, "reward_std": 0.18833193182945251, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.38330078125, "rewards/tag_count_reward/std": 0.17041288316249847, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.62109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1710.107421875, "completions/mean_terminated_length": 1156.2421875, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "entropy": 0.2831055819988251, "epoch": 0.003414134516899966, "frac_reward_zero_std": 0.21875, "grad_norm": 0.08364345420025317, "learning_rate": 3.071672354948805e-08, "loss": 0.0728, "num_tokens": 9473484.0, "reward": 0.4033203125, "reward_std": 0.14928041398525238, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3720703125, "rewards/tag_count_reward/std": 0.1548031121492386, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1740.412109375, "completions/mean_terminated_length": 1163.2528076171875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "entropy": 0.28812113404273987, "epoch": 0.0037555479685899623, "frac_reward_zero_std": 0.15625, "grad_norm": 0.08569064231348945, "learning_rate": 3.41296928327645e-08, "loss": 0.0661, "num_tokens": 10440159.0, "reward": 0.41357421875, "reward_std": 0.17004826664924622, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.36279296875, "rewards/tag_count_reward/std": 0.15275490283966064, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1687.376953125, "completions/mean_terminated_length": 1272.2059326171875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 0.29352451115846634, "epoch": 0.004096961420279959, "frac_reward_zero_std": 0.03125, "grad_norm": 0.09942272660954922, "learning_rate": 3.754266211604096e-08, "loss": 0.0859, "num_tokens": 11375472.0, "reward": 0.484375, "reward_std": 0.23423391580581665, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.400390625, "rewards/tag_count_reward/std": 0.16725550591945648, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1676.244140625, "completions/mean_terminated_length": 1056.651123046875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.3282046541571617, "epoch": 0.004438374871969956, "frac_reward_zero_std": 0.09375, "grad_norm": 0.1024651407044004, "learning_rate": 4.09556313993174e-08, "loss": 0.1078, "num_tokens": 12309517.0, "reward": 0.46044921875, "reward_std": 0.2169840931892395, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.36474609375, "rewards/tag_count_reward/std": 0.15762527287006378, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1722.857421875, "completions/mean_terminated_length": 1262.75, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.30098913609981537, "epoch": 0.004779788323659952, "frac_reward_zero_std": 0.15625, "grad_norm": 0.08798598203817233, "learning_rate": 4.436860068259386e-08, "loss": 0.0925, "num_tokens": 13268884.0, "reward": 0.4462890625, "reward_std": 0.18905583024024963, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3779296875, "rewards/tag_count_reward/std": 0.15401098132133484, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.509765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1596.919921875, "completions/mean_terminated_length": 1127.8685302734375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.2823268100619316, "epoch": 0.005121201775349949, "frac_reward_zero_std": 0.03125, "grad_norm": 0.0963110161905486, "learning_rate": 4.778156996587031e-08, "loss": 0.102, "num_tokens": 14163659.0, "reward": 0.515625, "reward_std": 0.25159594416618347, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.412109375, "rewards/tag_count_reward/std": 0.18985998630523682, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.58203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1655.3046875, "completions/mean_terminated_length": 1108.46728515625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.30971919000148773, "epoch": 0.0054626152270399455, "frac_reward_zero_std": 0.1875, "grad_norm": 0.08435601205291995, "learning_rate": 5.119453924914675e-08, "loss": 0.0761, "num_tokens": 15086343.0, "reward": 0.50830078125, "reward_std": 0.2241826355457306, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.36962890625, "rewards/tag_count_reward/std": 0.14065982401371002, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1651.330078125, "completions/mean_terminated_length": 1124.8408203125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3112441524863243, "epoch": 0.005804028678729942, "frac_reward_zero_std": 0.25, "grad_norm": 0.09107564914966042, "learning_rate": 5.460750853242321e-08, "loss": 0.0813, "num_tokens": 16007552.0, "reward": 0.46240234375, "reward_std": 0.20488740503787994, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37841796875, "rewards/tag_count_reward/std": 0.15079060196876526, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.666015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1759.646484375, "completions/mean_terminated_length": 1184.625732421875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "entropy": 0.31539302319288254, "epoch": 0.006145442130419939, "frac_reward_zero_std": 0.125, "grad_norm": 0.08815215936099594, "learning_rate": 5.802047781569966e-08, "loss": 0.0865, "num_tokens": 16991451.0, "reward": 0.400390625, "reward_std": 0.18520498275756836, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.353515625, "rewards/tag_count_reward/std": 0.15958404541015625, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.642578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1753.958984375, "completions/mean_terminated_length": 1225.327880859375, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "entropy": 0.288417287170887, "epoch": 0.0064868555821099355, "frac_reward_zero_std": 0.1875, "grad_norm": 0.08179752066444493, "learning_rate": 6.14334470989761e-08, "loss": 0.0681, "num_tokens": 17966822.0, "reward": 0.435546875, "reward_std": 0.19754618406295776, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.369140625, "rewards/tag_count_reward/std": 0.16615498065948486, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.60546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1687.39453125, "completions/mean_terminated_length": 1133.9901123046875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.29838598519563675, "epoch": 0.006828269033799932, "frac_reward_zero_std": 0.125, "grad_norm": 0.09589678760496248, "learning_rate": 6.484641638225255e-08, "loss": 0.081, "num_tokens": 18907984.0, "reward": 0.46142578125, "reward_std": 0.18289224803447723, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37939453125, "rewards/tag_count_reward/std": 0.1698399931192398, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1664.306640625, "completions/mean_terminated_length": 1155.0408935546875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.30119046568870544, "epoch": 0.007169682485489928, "frac_reward_zero_std": 0.09375, "grad_norm": 0.10156625711728379, "learning_rate": 6.8259385665529e-08, "loss": 0.0806, "num_tokens": 19833469.0, "reward": 0.521484375, "reward_std": 0.2385271191596985, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.38671875, "rewards/tag_count_reward/std": 0.1730620265007019, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1737.958984375, "completions/mean_terminated_length": 1221.2239990234375, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "entropy": 0.29330387711524963, "epoch": 0.0075110959371799246, "frac_reward_zero_std": 0.15625, "grad_norm": 0.08644912078792234, "learning_rate": 7.167235494880546e-08, "loss": 0.0661, "num_tokens": 20806216.0, "reward": 0.48876953125, "reward_std": 0.2353227287530899, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.36962890625, "rewards/tag_count_reward/std": 0.16093677282333374, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1786.705078125, "completions/mean_terminated_length": 1251.672607421875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.31794312596321106, "epoch": 0.007852509388869921, "frac_reward_zero_std": 0.09375, "grad_norm": 0.08823804226640387, "learning_rate": 7.508532423208192e-08, "loss": 0.077, "num_tokens": 21792673.0, "reward": 0.4150390625, "reward_std": 0.21195906400680542, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3681640625, "rewards/tag_count_reward/std": 0.16758368909358978, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1659.55078125, "completions/mean_terminated_length": 1082.533935546875, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "entropy": 0.3427746072411537, "epoch": 0.008193922840559918, "frac_reward_zero_std": 0.15625, "grad_norm": 0.09780582205048353, "learning_rate": 7.849829351535836e-08, "loss": 0.0955, "num_tokens": 22721979.0, "reward": 0.43896484375, "reward_std": 0.14896361529827118, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37646484375, "rewards/tag_count_reward/std": 0.16771692037582397, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.58203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1737.48828125, "completions/mean_terminated_length": 1305.0933837890625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.2817371040582657, "epoch": 0.008535336292249915, "frac_reward_zero_std": 0.15625, "grad_norm": 0.08664956765399244, "learning_rate": 8.19112627986348e-08, "loss": 0.0747, "num_tokens": 23688677.0, "reward": 0.44921875, "reward_std": 0.18252810835838318, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.38671875, "rewards/tag_count_reward/std": 0.1723538488149643, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.54296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1604.16796875, "completions/mean_terminated_length": 1076.88037109375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.32713863998651505, "epoch": 0.008876749743939911, "frac_reward_zero_std": 0.0625, "grad_norm": 0.09805986112473711, "learning_rate": 8.532423208191126e-08, "loss": 0.0766, "num_tokens": 24588555.0, "reward": 0.5185546875, "reward_std": 0.28301236033439636, "rewards/accuracy_reward/mean": 0.1270161271095276, "rewards/accuracy_reward/std": 0.3333272337913513, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3955078125, "rewards/tag_count_reward/std": 0.17224015295505524, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.525390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1572.84765625, "completions/mean_terminated_length": 1046.85595703125, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.3076319396495819, "epoch": 0.009218163195629908, "frac_reward_zero_std": 0.125, "grad_norm": 0.09261291243550486, "learning_rate": 8.873720136518772e-08, "loss": 0.0931, "num_tokens": 25463133.0, "reward": 0.431640625, "reward_std": 0.18853332102298737, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.400390625, "rewards/tag_count_reward/std": 0.17719781398773193, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.626953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1727.17578125, "completions/mean_terminated_length": 1187.989501953125, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "entropy": 0.3329521715641022, "epoch": 0.009559576647319904, "frac_reward_zero_std": 0.125, "grad_norm": 0.09683061460019309, "learning_rate": 9.215017064846416e-08, "loss": 0.0802, "num_tokens": 26436535.0, "reward": 0.40625, "reward_std": 0.18102425336837769, "rewards/accuracy_reward/mean": 0.04838709533214569, "rewards/accuracy_reward/std": 0.21479946374893188, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.359375, "rewards/tag_count_reward/std": 0.14334554970264435, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.642578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1734.51171875, "completions/mean_terminated_length": 1170.91796875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.3126542940735817, "epoch": 0.009900990099009901, "frac_reward_zero_std": 0.09375, "grad_norm": 0.09401549884241026, "learning_rate": 9.556313993174062e-08, "loss": 0.0901, "num_tokens": 27399869.0, "reward": 0.4384765625, "reward_std": 0.19684988260269165, "rewards/accuracy_reward/mean": 0.07661290466785431, "rewards/accuracy_reward/std": 0.2662447690963745, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3642578125, "rewards/tag_count_reward/std": 0.15681466460227966, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.623046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1699.01171875, "completions/mean_terminated_length": 1122.1865234375, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "entropy": 0.29206109791994095, "epoch": 0.010242403550699898, "frac_reward_zero_std": 0.3125, "grad_norm": 0.07815274823325402, "learning_rate": 9.897610921501706e-08, "loss": 0.0544, "num_tokens": 28349731.0, "reward": 0.4580078125, "reward_std": 0.1374412626028061, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3642578125, "rewards/tag_count_reward/std": 0.15206290781497955, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.55859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1663.91015625, "completions/mean_terminated_length": 1177.849609375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.334923155605793, "epoch": 0.010583817002389894, "frac_reward_zero_std": 0.15625, "grad_norm": 0.09649619116924545, "learning_rate": 1.023890784982935e-07, "loss": 0.0786, "num_tokens": 29280437.0, "reward": 0.486328125, "reward_std": 0.22537781298160553, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.39453125, "rewards/tag_count_reward/std": 0.17656034231185913, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1710.26171875, "completions/mean_terminated_length": 1261.9908447265625, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "entropy": 0.3137167617678642, "epoch": 0.010925230454079891, "frac_reward_zero_std": 0.125, "grad_norm": 0.09689005487474657, "learning_rate": 1.0580204778156996e-07, "loss": 0.0928, "num_tokens": 30233515.0, "reward": 0.44384765625, "reward_std": 0.21052563190460205, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.38330078125, "rewards/tag_count_reward/std": 0.15460777282714844, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.58984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1654.958984375, "completions/mean_terminated_length": 1089.7286376953125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.3079943507909775, "epoch": 0.011266643905769888, "frac_reward_zero_std": 0.125, "grad_norm": 0.09865187385978204, "learning_rate": 1.0921501706484642e-07, "loss": 0.0804, "num_tokens": 31165590.0, "reward": 0.42822265625, "reward_std": 0.19113923609256744, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37158203125, "rewards/tag_count_reward/std": 0.15159955620765686, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.55859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1697.7734375, "completions/mean_terminated_length": 1254.56640625, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "entropy": 0.33544880896806717, "epoch": 0.011608057357459884, "frac_reward_zero_std": 0.15625, "grad_norm": 0.09762493202364121, "learning_rate": 1.1262798634812286e-07, "loss": 0.0931, "num_tokens": 32108130.0, "reward": 0.4931640625, "reward_std": 0.2264755666255951, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3935546875, "rewards/tag_count_reward/std": 0.1652178168296814, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.556640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1624.607421875, "completions/mean_terminated_length": 1093.03515625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.2975941374897957, "epoch": 0.011949470809149881, "frac_reward_zero_std": 0.0625, "grad_norm": 0.0949259155307809, "learning_rate": 1.1604095563139932e-07, "loss": 0.0835, "num_tokens": 33021945.0, "reward": 0.52734375, "reward_std": 0.24892792105674744, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3984375, "rewards/tag_count_reward/std": 0.17608344554901123, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.61328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1732.669921875, "completions/mean_terminated_length": 1232.6009521484375, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "entropy": 0.32126346975564957, "epoch": 0.012290884260839878, "frac_reward_zero_std": 0.1875, "grad_norm": 0.08829810328097695, "learning_rate": 1.1945392491467578e-07, "loss": 0.0868, "num_tokens": 33992768.0, "reward": 0.43212890625, "reward_std": 0.1878102421760559, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37353515625, "rewards/tag_count_reward/std": 0.166252002120018, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.646484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1782.966796875, "completions/mean_terminated_length": 1298.2928466796875, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "entropy": 0.3144604191184044, "epoch": 0.012632297712529874, "frac_reward_zero_std": 0.125, "grad_norm": 0.09402831249321719, "learning_rate": 1.228668941979522e-07, "loss": 0.0641, "num_tokens": 34980591.0, "reward": 0.42626953125, "reward_std": 0.1663297712802887, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.35986328125, "rewards/tag_count_reward/std": 0.135504350066185, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1696.7890625, "completions/mean_terminated_length": 1183.4808349609375, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "entropy": 0.30845295637845993, "epoch": 0.012973711164219871, "frac_reward_zero_std": 0.125, "grad_norm": 0.0903541107622616, "learning_rate": 1.2627986348122866e-07, "loss": 0.0901, "num_tokens": 35930067.0, "reward": 0.42578125, "reward_std": 0.19087107479572296, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.388671875, "rewards/tag_count_reward/std": 0.18188132345676422, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.611328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1721.865234375, "completions/mean_terminated_length": 1208.8995361328125, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "entropy": 0.30715613812208176, "epoch": 0.013315124615909868, "frac_reward_zero_std": 0.1875, "grad_norm": 0.08981194804751812, "learning_rate": 1.296928327645051e-07, "loss": 0.0808, "num_tokens": 36889566.0, "reward": 0.46484375, "reward_std": 0.20999453961849213, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37890625, "rewards/tag_count_reward/std": 0.17411890625953674, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1780.396484375, "completions/mean_terminated_length": 1232.4464111328125, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "entropy": 0.29111943393945694, "epoch": 0.013656538067599864, "frac_reward_zero_std": 0.21875, "grad_norm": 0.08106195366291308, "learning_rate": 1.3310580204778158e-07, "loss": 0.0953, "num_tokens": 37881721.0, "reward": 0.38671875, "reward_std": 0.1475594937801361, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.353515625, "rewards/tag_count_reward/std": 0.15412573516368866, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.572265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1699.578125, "completions/mean_terminated_length": 1233.424560546875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.3249722197651863, "epoch": 0.01399795151928986, "frac_reward_zero_std": 0.1875, "grad_norm": 0.08735588911364883, "learning_rate": 1.36518771331058e-07, "loss": 0.0719, "num_tokens": 38822769.0, "reward": 0.44189453125, "reward_std": 0.19241851568222046, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.38720703125, "rewards/tag_count_reward/std": 0.17302820086479187, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1741.001953125, "completions/mean_terminated_length": 1112.386962890625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.33407432585954666, "epoch": 0.014339364970979856, "frac_reward_zero_std": 0.09375, "grad_norm": 0.09191891575868615, "learning_rate": 1.3993174061433446e-07, "loss": 0.0884, "num_tokens": 39799522.0, "reward": 0.4384765625, "reward_std": 0.2225104570388794, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3623046875, "rewards/tag_count_reward/std": 0.17579832673072815, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.66015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1760.171875, "completions/mean_terminated_length": 1201.0574951171875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.29202771186828613, "epoch": 0.014680778422669852, "frac_reward_zero_std": 0.25, "grad_norm": 0.08296587992826984, "learning_rate": 1.4334470989761092e-07, "loss": 0.0566, "num_tokens": 40774330.0, "reward": 0.42041015625, "reward_std": 0.17149165272712708, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.35400390625, "rewards/tag_count_reward/std": 0.14604566991329193, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.58203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1684.68359375, "completions/mean_terminated_length": 1178.7569580078125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.30224334448575974, "epoch": 0.015022191874359849, "frac_reward_zero_std": 0.15625, "grad_norm": 0.08928212522613092, "learning_rate": 1.4675767918088735e-07, "loss": 0.0628, "num_tokens": 41707096.0, "reward": 0.4814453125, "reward_std": 0.2258215844631195, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3798828125, "rewards/tag_count_reward/std": 0.16246506571769714, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1720.970703125, "completions/mean_terminated_length": 1243.0048828125, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "entropy": 0.30127714574337006, "epoch": 0.015363605326049846, "frac_reward_zero_std": 0.21875, "grad_norm": 0.08164447655440359, "learning_rate": 1.5017064846416383e-07, "loss": 0.074, "num_tokens": 42669993.0, "reward": 0.42578125, "reward_std": 0.15132513642311096, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37890625, "rewards/tag_count_reward/std": 0.15945225954055786, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1764.048828125, "completions/mean_terminated_length": 1274.6861572265625, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "entropy": 0.3226234093308449, "epoch": 0.015705018777739842, "frac_reward_zero_std": 0.1875, "grad_norm": 0.08676075843154674, "learning_rate": 1.5358361774744026e-07, "loss": 0.0782, "num_tokens": 43647634.0, "reward": 0.46533203125, "reward_std": 0.21537040174007416, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37548828125, "rewards/tag_count_reward/std": 0.17061462998390198, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.615234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1724.103515625, "completions/mean_terminated_length": 1206.197998046875, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "entropy": 0.299176961183548, "epoch": 0.01604643222942984, "frac_reward_zero_std": 0.28125, "grad_norm": 0.08000143797299085, "learning_rate": 1.5699658703071672e-07, "loss": 0.0618, "num_tokens": 44606215.0, "reward": 0.439453125, "reward_std": 0.13332784175872803, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37890625, "rewards/tag_count_reward/std": 0.16985194385051727, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.697265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1820.1171875, "completions/mean_terminated_length": 1295.2515869140625, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.3038352280855179, "epoch": 0.016387845681119836, "frac_reward_zero_std": 0.1875, "grad_norm": 0.08281062464693738, "learning_rate": 1.6040955631399318e-07, "loss": 0.0657, "num_tokens": 45618819.0, "reward": 0.400390625, "reward_std": 0.14732903242111206, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.353515625, "rewards/tag_count_reward/std": 0.15091808140277863, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.619140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1710.6015625, "completions/mean_terminated_length": 1162.1129150390625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2973631024360657, "epoch": 0.016729259132809832, "frac_reward_zero_std": 0.1875, "grad_norm": 0.08804579707170518, "learning_rate": 1.638225255972696e-07, "loss": 0.0687, "num_tokens": 46568039.0, "reward": 0.42724609375, "reward_std": 0.16326695680618286, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.36865234375, "rewards/tag_count_reward/std": 0.15231013298034668, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.52734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1646.607421875, "completions/mean_terminated_length": 1198.772705078125, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "entropy": 0.29988065361976624, "epoch": 0.01707067258449983, "frac_reward_zero_std": 0.03125, "grad_norm": 0.0938529122260617, "learning_rate": 1.6723549488054606e-07, "loss": 0.0867, "num_tokens": 47479166.0, "reward": 0.49951171875, "reward_std": 0.2066783607006073, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.39794921875, "rewards/tag_count_reward/std": 0.17614787817001343, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.587890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1723.61328125, "completions/mean_terminated_length": 1260.862548828125, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 0.31749942898750305, "epoch": 0.017412086036189826, "frac_reward_zero_std": 0.125, "grad_norm": 0.09439026246169695, "learning_rate": 1.7064846416382252e-07, "loss": 0.0896, "num_tokens": 48440888.0, "reward": 0.4794921875, "reward_std": 0.21595461666584015, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3818359375, "rewards/tag_count_reward/std": 0.1719069629907608, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.576171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1659.083984375, "completions/mean_terminated_length": 1130.373291015625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.3071504086256027, "epoch": 0.017753499487879822, "frac_reward_zero_std": 0.125, "grad_norm": 0.09523045826138105, "learning_rate": 1.7406143344709898e-07, "loss": 0.0796, "num_tokens": 49368659.0, "reward": 0.443359375, "reward_std": 0.19807878136634827, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.39453125, "rewards/tag_count_reward/std": 0.16949151456356049, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.61328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1710.994140625, "completions/mean_terminated_length": 1176.550537109375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.31784167140722275, "epoch": 0.01809491293956982, "frac_reward_zero_std": 0.125, "grad_norm": 0.09214157588401625, "learning_rate": 1.7747440273037543e-07, "loss": 0.0809, "num_tokens": 50318848.0, "reward": 0.453125, "reward_std": 0.1960989534854889, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.376953125, "rewards/tag_count_reward/std": 0.16177250444889069, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1595.994140625, "completions/mean_terminated_length": 1067.3770751953125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.30455347895622253, "epoch": 0.018436326391259816, "frac_reward_zero_std": 0.125, "grad_norm": 0.09266645080986674, "learning_rate": 1.8088737201365186e-07, "loss": 0.0847, "num_tokens": 51219005.0, "reward": 0.4912109375, "reward_std": 0.2202983945608139, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4072265625, "rewards/tag_count_reward/std": 0.1926652193069458, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.595703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1745.6796875, "completions/mean_terminated_length": 1300.2318115234375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.2953370288014412, "epoch": 0.018777739842949812, "frac_reward_zero_std": 0.1875, "grad_norm": 0.08647427082218719, "learning_rate": 1.8430034129692832e-07, "loss": 0.0857, "num_tokens": 52185657.0, "reward": 0.43359375, "reward_std": 0.200666606426239, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.384765625, "rewards/tag_count_reward/std": 0.16523228585720062, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1529.83984375, "completions/mean_terminated_length": 1100.5072021484375, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "entropy": 0.34059522300958633, "epoch": 0.01911915329463981, "frac_reward_zero_std": 0.0625, "grad_norm": 0.10436022880480439, "learning_rate": 1.8771331058020475e-07, "loss": 0.0796, "num_tokens": 53046135.0, "reward": 0.5166015625, "reward_std": 0.24053695797920227, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4130859375, "rewards/tag_count_reward/std": 0.1677660495042801, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.58984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1737.2109375, "completions/mean_terminated_length": 1290.2667236328125, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "entropy": 0.32681455463171005, "epoch": 0.019460566746329806, "frac_reward_zero_std": 0.03125, "grad_norm": 0.09481013594350139, "learning_rate": 1.9112627986348124e-07, "loss": 0.0812, "num_tokens": 54013507.0, "reward": 0.43359375, "reward_std": 0.1983286440372467, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37890625, "rewards/tag_count_reward/std": 0.15945225954055786, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1824.1875, "completions/mean_terminated_length": 1365.90478515625, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 0.30665700882673264, "epoch": 0.019801980198019802, "frac_reward_zero_std": 0.21875, "grad_norm": 0.0843579403614099, "learning_rate": 1.9453924914675767e-07, "loss": 0.0789, "num_tokens": 55039827.0, "reward": 0.40966796875, "reward_std": 0.18145982921123505, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.35302734375, "rewards/tag_count_reward/std": 0.15326076745986938, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1778.169921875, "completions/mean_terminated_length": 1184.5438232421875, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "entropy": 0.2964690774679184, "epoch": 0.0201433936497098, "frac_reward_zero_std": 0.1875, "grad_norm": 0.08172363446728816, "learning_rate": 1.9795221843003412e-07, "loss": 0.0954, "num_tokens": 56018682.0, "reward": 0.41796875, "reward_std": 0.18680137395858765, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.349609375, "rewards/tag_count_reward/std": 0.1597755402326584, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.537109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1617.96875, "completions/mean_terminated_length": 1118.9873046875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.30621494352817535, "epoch": 0.020484807101399796, "frac_reward_zero_std": 0.03125, "grad_norm": 0.10161229179778156, "learning_rate": 2.0136518771331058e-07, "loss": 0.1011, "num_tokens": 56930506.0, "reward": 0.435546875, "reward_std": 0.19600114226341248, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.39453125, "rewards/tag_count_reward/std": 0.17021159827709198, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.587890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1721.10546875, "completions/mean_terminated_length": 1254.77734375, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "entropy": 0.2876248136162758, "epoch": 0.020826220553089792, "frac_reward_zero_std": 0.25, "grad_norm": 0.07861943539354363, "learning_rate": 2.04778156996587e-07, "loss": 0.0586, "num_tokens": 57885840.0, "reward": 0.46533203125, "reward_std": 0.20033249258995056, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.38720703125, "rewards/tag_count_reward/std": 0.17231987416744232, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.42578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1556.396484375, "completions/mean_terminated_length": 1191.8741455078125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.32752181589603424, "epoch": 0.02116763400477979, "frac_reward_zero_std": 0.03125, "grad_norm": 0.10889129948316872, "learning_rate": 2.0819112627986347e-07, "loss": 0.1063, "num_tokens": 58756651.0, "reward": 0.55322265625, "reward_std": 0.2352120280265808, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.42236328125, "rewards/tag_count_reward/std": 0.1661197543144226, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.48046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1557.140625, "completions/mean_terminated_length": 1103.18798828125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.3179500252008438, "epoch": 0.021509047456469785, "frac_reward_zero_std": 0.125, "grad_norm": 0.10621411559172043, "learning_rate": 2.1160409556313992e-07, "loss": 0.0887, "num_tokens": 59636963.0, "reward": 0.4931640625, "reward_std": 0.19571489095687866, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4111328125, "rewards/tag_count_reward/std": 0.17877350747585297, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.498046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1603.806640625, "completions/mean_terminated_length": 1163.070068359375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.3006089851260185, "epoch": 0.021850460908159782, "frac_reward_zero_std": 0.0625, "grad_norm": 0.09495047459619776, "learning_rate": 2.1501706484641638e-07, "loss": 0.0843, "num_tokens": 60537040.0, "reward": 0.5166015625, "reward_std": 0.25184354186058044, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4169921875, "rewards/tag_count_reward/std": 0.17470784485340118, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.537109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1616.056640625, "completions/mean_terminated_length": 1114.8564453125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.3109700530767441, "epoch": 0.02219187435984978, "frac_reward_zero_std": 0.125, "grad_norm": 0.09574080038474271, "learning_rate": 2.1843003412969284e-07, "loss": 0.0785, "num_tokens": 61444589.0, "reward": 0.49462890625, "reward_std": 0.21358317136764526, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.39697265625, "rewards/tag_count_reward/std": 0.1655377745628357, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.552734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1656.8828125, "completions/mean_terminated_length": 1173.537109375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.28578998148441315, "epoch": 0.022533287811539775, "frac_reward_zero_std": 0.0, "grad_norm": 0.0987604080096604, "learning_rate": 2.2184300341296927e-07, "loss": 0.0828, "num_tokens": 62370273.0, "reward": 0.43994140625, "reward_std": 0.1837189495563507, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.39306640625, "rewards/tag_count_reward/std": 0.16453032195568085, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.642578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1754.1875, "completions/mean_terminated_length": 1225.9671630859375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "entropy": 0.3208107650279999, "epoch": 0.022874701263229772, "frac_reward_zero_std": 0.15625, "grad_norm": 0.0893545954230494, "learning_rate": 2.2525597269624572e-07, "loss": 0.08, "num_tokens": 63336897.0, "reward": 0.43994140625, "reward_std": 0.1818157434463501, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.36767578125, "rewards/tag_count_reward/std": 0.15701180696487427, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1722.03515625, "completions/mean_terminated_length": 1120.8111572265625, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "entropy": 0.33585531264543533, "epoch": 0.02321611471491977, "frac_reward_zero_std": 0.21875, "grad_norm": 0.09732734454434964, "learning_rate": 2.2866894197952215e-07, "loss": 0.0911, "num_tokens": 64295875.0, "reward": 0.45849609375, "reward_std": 0.20117489993572235, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37451171875, "rewards/tag_count_reward/std": 0.18768273293972015, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.720703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1844.44140625, "completions/mean_terminated_length": 1319.1748046875, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "entropy": 0.31821244210004807, "epoch": 0.023557528166609765, "frac_reward_zero_std": 0.15625, "grad_norm": 0.08672331225770104, "learning_rate": 2.3208191126279864e-07, "loss": 0.0758, "num_tokens": 65324645.0, "reward": 0.365234375, "reward_std": 0.14191332459449768, "rewards/accuracy_reward/mean": 0.01411290280520916, "rewards/accuracy_reward/std": 0.11807556450366974, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3515625, "rewards/tag_count_reward/std": 0.15776540338993073, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.517578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1644.623046875, "completions/mean_terminated_length": 1211.8502197265625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.31963393092155457, "epoch": 0.023898941618299762, "frac_reward_zero_std": 0.0625, "grad_norm": 0.0965116101869191, "learning_rate": 2.354948805460751e-07, "loss": 0.1017, "num_tokens": 66245332.0, "reward": 0.50537109375, "reward_std": 0.22031089663505554, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.40380859375, "rewards/tag_count_reward/std": 0.16888649761676788, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.615234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1735.1875, "completions/mean_terminated_length": 1235.0050048828125, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "entropy": 0.32154613733291626, "epoch": 0.02424035506998976, "frac_reward_zero_std": 0.125, "grad_norm": 0.09103440045635448, "learning_rate": 2.3890784982935155e-07, "loss": 0.0654, "num_tokens": 67209876.0, "reward": 0.42724609375, "reward_std": 0.1832031011581421, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37646484375, "rewards/tag_count_reward/std": 0.16177767515182495, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.56640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1657.087890625, "completions/mean_terminated_length": 1146.43701171875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.33111153542995453, "epoch": 0.024581768521679755, "frac_reward_zero_std": 0.09375, "grad_norm": 0.10113638223892449, "learning_rate": 2.42320819112628e-07, "loss": 0.0876, "num_tokens": 68144433.0, "reward": 0.46142578125, "reward_std": 0.1868152767419815, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.40283203125, "rewards/tag_count_reward/std": 0.17957545816898346, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 1568.4609375, "completions/mean_terminated_length": 1057.98388671875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.32604753226041794, "epoch": 0.024923181973369752, "frac_reward_zero_std": 0.125, "grad_norm": 0.09949251389976838, "learning_rate": 2.457337883959044e-07, "loss": 0.0744, "num_tokens": 69033837.0, "reward": 0.4501953125, "reward_std": 0.1610129177570343, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3955078125, "rewards/tag_count_reward/std": 0.15186168253421783, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1626.583984375, "completions/mean_terminated_length": 1141.4244384765625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "entropy": 0.33295150846242905, "epoch": 0.02526459542505975, "frac_reward_zero_std": 0.1875, "grad_norm": 0.09263141577581939, "learning_rate": 2.4914675767918084e-07, "loss": 0.0866, "num_tokens": 69940520.0, "reward": 0.5, "reward_std": 0.21913942694664001, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.396484375, "rewards/tag_count_reward/std": 0.167072594165802, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1750.052734375, "completions/mean_terminated_length": 1218.9293212890625, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "entropy": 0.3089717626571655, "epoch": 0.025606008876749745, "frac_reward_zero_std": 0.125, "grad_norm": 0.09458391677171006, "learning_rate": 2.525597269624573e-07, "loss": 0.0768, "num_tokens": 70916547.0, "reward": 0.40869140625, "reward_std": 0.18127226829528809, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.36376953125, "rewards/tag_count_reward/std": 0.16290192306041718, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.56640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1655.30859375, "completions/mean_terminated_length": 1142.3333740234375, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "entropy": 0.30182696878910065, "epoch": 0.025947422328439742, "frac_reward_zero_std": 0.125, "grad_norm": 0.0877331212347654, "learning_rate": 2.5597269624573375e-07, "loss": 0.0644, "num_tokens": 71833745.0, "reward": 0.48828125, "reward_std": 0.21789415180683136, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.39453125, "rewards/tag_count_reward/std": 0.17446976900100708, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.669921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1764.404296875, "completions/mean_terminated_length": 1188.822509765625, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "entropy": 0.34114911407232285, "epoch": 0.02628883578012974, "frac_reward_zero_std": 0.25, "grad_norm": 0.08618105852829192, "learning_rate": 2.593856655290102e-07, "loss": 0.0888, "num_tokens": 72814624.0, "reward": 0.40185546875, "reward_std": 0.1211591362953186, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.35498046875, "rewards/tag_count_reward/std": 0.15111975371837616, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.494140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1625.62109375, "completions/mean_terminated_length": 1213.0269775390625, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "entropy": 0.3294278681278229, "epoch": 0.026630249231819735, "frac_reward_zero_std": 0.0625, "grad_norm": 0.10711088866844821, "learning_rate": 2.627986348122867e-07, "loss": 0.0806, "num_tokens": 73728254.0, "reward": 0.50830078125, "reward_std": 0.22900179028511047, "rewards/accuracy_reward/mean": 0.11693548411130905, "rewards/accuracy_reward/std": 0.3216678202152252, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.39501953125, "rewards/tag_count_reward/std": 0.1657858043909073, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.60546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1761.95703125, "completions/mean_terminated_length": 1322.980224609375, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "entropy": 0.33952364325523376, "epoch": 0.026971662683509732, "frac_reward_zero_std": 0.09375, "grad_norm": 0.09937937541208347, "learning_rate": 2.6621160409556315e-07, "loss": 0.0877, "num_tokens": 74711000.0, "reward": 0.4228515625, "reward_std": 0.1958680897951126, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3818359375, "rewards/tag_count_reward/std": 0.17888037860393524, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.544921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1616.771484375, "completions/mean_terminated_length": 1100.40771484375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.31882456690073013, "epoch": 0.02731307613519973, "frac_reward_zero_std": 0.1875, "grad_norm": 0.0868334287945638, "learning_rate": 2.696245733788396e-07, "loss": 0.0823, "num_tokens": 75623123.0, "reward": 0.43896484375, "reward_std": 0.18612055480480194, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.38623046875, "rewards/tag_count_reward/std": 0.17238640785217285, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.435546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1505.046875, "completions/mean_terminated_length": 1086.0899658203125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.27675093710422516, "epoch": 0.027654489586889725, "frac_reward_zero_std": 0.09375, "grad_norm": 0.09449151428635437, "learning_rate": 2.73037542662116e-07, "loss": 0.0822, "num_tokens": 76465547.0, "reward": 0.58544921875, "reward_std": 0.24257883429527283, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.43310546875, "rewards/tag_count_reward/std": 0.17637556791305542, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1750.251953125, "completions/mean_terminated_length": 1254.0052490234375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.32397812604904175, "epoch": 0.02799590303857972, "frac_reward_zero_std": 0.09375, "grad_norm": 0.09278754938206935, "learning_rate": 2.764505119453925e-07, "loss": 0.0829, "num_tokens": 77441084.0, "reward": 0.4755859375, "reward_std": 0.25697848200798035, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3720703125, "rewards/tag_count_reward/std": 0.16769768297672272, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1699.5625, "completions/mean_terminated_length": 1173.490234375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.3514496088027954, "epoch": 0.028337316490269715, "frac_reward_zero_std": 0.0625, "grad_norm": 0.10348076867259319, "learning_rate": 2.798634812286689e-07, "loss": 0.1166, "num_tokens": 78385404.0, "reward": 0.4462890625, "reward_std": 0.21120589971542358, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3837890625, "rewards/tag_count_reward/std": 0.1821831464767456, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1649.166015625, "completions/mean_terminated_length": 1152.372802734375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "entropy": 0.3302997201681137, "epoch": 0.02867872994195971, "frac_reward_zero_std": 0.09375, "grad_norm": 0.10159672198810173, "learning_rate": 2.8327645051194536e-07, "loss": 0.0748, "num_tokens": 79304417.0, "reward": 0.46044921875, "reward_std": 0.20142708718776703, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.39794921875, "rewards/tag_count_reward/std": 0.17264120280742645, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.615234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1730.0703125, "completions/mean_terminated_length": 1221.70556640625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.3137484937906265, "epoch": 0.02902014339364971, "frac_reward_zero_std": 0.0625, "grad_norm": 0.09506566464977444, "learning_rate": 2.8668941979522184e-07, "loss": 0.0862, "num_tokens": 80266053.0, "reward": 0.43896484375, "reward_std": 0.19225408136844635, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.38232421875, "rewards/tag_count_reward/std": 0.16756302118301392, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.587890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1689.578125, "completions/mean_terminated_length": 1178.27490234375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.3317873105406761, "epoch": 0.029361556845339705, "frac_reward_zero_std": 0.09375, "grad_norm": 0.09911213197591902, "learning_rate": 2.9010238907849827e-07, "loss": 0.0751, "num_tokens": 81216349.0, "reward": 0.466796875, "reward_std": 0.1838911473751068, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.376953125, "rewards/tag_count_reward/std": 0.16025325655937195, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1535.25, "completions/mean_terminated_length": 1053.5758056640625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.35992803424596786, "epoch": 0.0297029702970297, "frac_reward_zero_std": 0.125, "grad_norm": 0.10479512881437489, "learning_rate": 2.935153583617747e-07, "loss": 0.0969, "num_tokens": 82081165.0, "reward": 0.45654296875, "reward_std": 0.20561590790748596, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.39404296875, "rewards/tag_count_reward/std": 0.18473725020885468, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.51953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1677.837890625, "completions/mean_terminated_length": 1277.581298828125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.3488902375102043, "epoch": 0.030044383748719698, "frac_reward_zero_std": 0.125, "grad_norm": 0.09317746927066792, "learning_rate": 2.969283276450512e-07, "loss": 0.095, "num_tokens": 83027322.0, "reward": 0.48583984375, "reward_std": 0.23675750195980072, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.38818359375, "rewards/tag_count_reward/std": 0.17153623700141907, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1637.41015625, "completions/mean_terminated_length": 1200.33056640625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.3045988902449608, "epoch": 0.030385797200409695, "frac_reward_zero_std": 0.125, "grad_norm": 0.09705826406103248, "learning_rate": 3.0034129692832767e-07, "loss": 0.0918, "num_tokens": 83939324.0, "reward": 0.44970703125, "reward_std": 0.18865254521369934, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.40087890625, "rewards/tag_count_reward/std": 0.16936108469963074, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.560546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1625.236328125, "completions/mean_terminated_length": 1085.977783203125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.318245105445385, "epoch": 0.03072721065209969, "frac_reward_zero_std": 0.03125, "grad_norm": 0.09584058936610584, "learning_rate": 3.037542662116041e-07, "loss": 0.1015, "num_tokens": 84846501.0, "reward": 0.48779296875, "reward_std": 0.24454732239246368, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.40771484375, "rewards/tag_count_reward/std": 0.1963563710451126, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.451171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1553.033203125, "completions/mean_terminated_length": 1146.1387939453125, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "entropy": 0.3284667879343033, "epoch": 0.031068624103789688, "frac_reward_zero_std": 0.0625, "grad_norm": 0.10583479829718444, "learning_rate": 3.0716723549488053e-07, "loss": 0.0895, "num_tokens": 85716950.0, "reward": 0.51611328125, "reward_std": 0.25324946641921997, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.43017578125, "rewards/tag_count_reward/std": 0.18140722811222076, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1816.720703125, "completions/mean_terminated_length": 1375.1875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3151656910777092, "epoch": 0.031410037555479685, "frac_reward_zero_std": 0.0, "grad_norm": 0.0967047863499484, "learning_rate": 3.10580204778157e-07, "loss": 0.0952, "num_tokens": 86731735.0, "reward": 0.39208984375, "reward_std": 0.18847689032554626, "rewards/accuracy_reward/mean": 0.03750000149011612, "rewards/accuracy_reward/std": 0.19018177688121796, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.35693359375, "rewards/tag_count_reward/std": 0.1456000804901123, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.419921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1468.671875, "completions/mean_terminated_length": 1049.2928466796875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.3375259339809418, "epoch": 0.03175145100716968, "frac_reward_zero_std": 0.0, "grad_norm": 0.11253789345723109, "learning_rate": 3.1399317406143344e-07, "loss": 0.11, "num_tokens": 87552831.0, "reward": 0.54345703125, "reward_std": 0.24634727835655212, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.43603515625, "rewards/tag_count_reward/std": 0.1739809662103653, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.564453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1671.775390625, "completions/mean_terminated_length": 1184.201904296875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3285514563322067, "epoch": 0.03209286445885968, "frac_reward_zero_std": 0.125, "grad_norm": 0.0993017440866758, "learning_rate": 3.1740614334470987e-07, "loss": 0.0769, "num_tokens": 88486444.0, "reward": 0.46435546875, "reward_std": 0.20933231711387634, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.39599609375, "rewards/tag_count_reward/std": 0.18185439705848694, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1688.796875, "completions/mean_terminated_length": 1196.5555419921875, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "entropy": 0.32562054693698883, "epoch": 0.032434277910549675, "frac_reward_zero_std": 0.0, "grad_norm": 0.10068830028211528, "learning_rate": 3.2081911262798635e-07, "loss": 0.1044, "num_tokens": 89432852.0, "reward": 0.4638671875, "reward_std": 0.2555462121963501, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3837890625, "rewards/tag_count_reward/std": 0.1660255342721939, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1620.6796875, "completions/mean_terminated_length": 1151.3277587890625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.3544609248638153, "epoch": 0.03277569136223967, "frac_reward_zero_std": 0.09375, "grad_norm": 0.10040859657029937, "learning_rate": 3.242320819112628e-07, "loss": 0.1237, "num_tokens": 90332480.0, "reward": 0.48974609375, "reward_std": 0.2444038689136505, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.40576171875, "rewards/tag_count_reward/std": 0.18381424248218536, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.521484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1638.9609375, "completions/mean_terminated_length": 1193.1917724609375, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.3420708477497101, "epoch": 0.03311710481392967, "frac_reward_zero_std": 0.0625, "grad_norm": 0.09666013058518023, "learning_rate": 3.276450511945392e-07, "loss": 0.1052, "num_tokens": 91247676.0, "reward": 0.5009765625, "reward_std": 0.22199657559394836, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4150390625, "rewards/tag_count_reward/std": 0.1853446513414383, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.619140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1709.580078125, "completions/mean_terminated_length": 1159.4307861328125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.34292516857385635, "epoch": 0.033458518265619665, "frac_reward_zero_std": 0.15625, "grad_norm": 0.0912474752316875, "learning_rate": 3.3105802047781565e-07, "loss": 0.0708, "num_tokens": 92203877.0, "reward": 0.4169921875, "reward_std": 0.178598091006279, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3701171875, "rewards/tag_count_reward/std": 0.15396134555339813, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1611.150390625, "completions/mean_terminated_length": 1100.2584228515625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.36559291183948517, "epoch": 0.03379993171730966, "frac_reward_zero_std": 0.09375, "grad_norm": 0.10001313567633252, "learning_rate": 3.3447098976109213e-07, "loss": 0.0986, "num_tokens": 93105666.0, "reward": 0.43212890625, "reward_std": 0.16994985938072205, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.16324250400066376, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.40478515625, "rewards/tag_count_reward/std": 0.18062067031860352, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.572265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1652.50390625, "completions/mean_terminated_length": 1123.369873046875, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "entropy": 0.31865906715393066, "epoch": 0.03414134516899966, "frac_reward_zero_std": 0.0625, "grad_norm": 0.09699599499273771, "learning_rate": 3.3788395904436856e-07, "loss": 0.0699, "num_tokens": 94028884.0, "reward": 0.46875, "reward_std": 0.18947193026542664, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.40234375, "rewards/tag_count_reward/std": 0.18567688763141632, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1740.103515625, "completions/mean_terminated_length": 1304.4010009765625, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.35445644706487656, "epoch": 0.034482758620689655, "frac_reward_zero_std": 0.0, "grad_norm": 0.102377336669836, "learning_rate": 3.4129692832764504e-07, "loss": 0.0959, "num_tokens": 94991081.0, "reward": 0.43408203125, "reward_std": 0.18991538882255554, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.38525390625, "rewards/tag_count_reward/std": 0.15449030697345734, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1708.65625, "completions/mean_terminated_length": 1161.551025390625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.30837277323007584, "epoch": 0.03482417207237965, "frac_reward_zero_std": 0.09375, "grad_norm": 0.09554617933765469, "learning_rate": 3.447098976109215e-07, "loss": 0.1003, "num_tokens": 95943097.0, "reward": 0.4189453125, "reward_std": 0.1886759102344513, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3759765625, "rewards/tag_count_reward/std": 0.16698963940143585, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.48828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1563.322265625, "completions/mean_terminated_length": 1100.843505859375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.35114211589097977, "epoch": 0.03516558552406965, "frac_reward_zero_std": 0.09375, "grad_norm": 0.10915765864406857, "learning_rate": 3.4812286689419796e-07, "loss": 0.0869, "num_tokens": 96821374.0, "reward": 0.4775390625, "reward_std": 0.18612171709537506, "rewards/accuracy_reward/mean": 0.04838709533214569, "rewards/accuracy_reward/std": 0.21479946374893188, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4306640625, "rewards/tag_count_reward/std": 0.20109082758426666, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1503.828125, "completions/mean_terminated_length": 1188.0740966796875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.31313805282115936, "epoch": 0.035506998975759645, "frac_reward_zero_std": 0.0, "grad_norm": 0.10433462741342017, "learning_rate": 3.515358361774744e-07, "loss": 0.1091, "num_tokens": 97672774.0, "reward": 0.6044921875, "reward_std": 0.2917139530181885, "rewards/accuracy_reward/mean": 0.17540322244167328, "rewards/accuracy_reward/std": 0.3806955814361572, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4345703125, "rewards/tag_count_reward/std": 0.17023125290870667, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.556640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1650.779296875, "completions/mean_terminated_length": 1152.0660400390625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.34535282105207443, "epoch": 0.03584841242744964, "frac_reward_zero_std": 0.125, "grad_norm": 0.10356461390279416, "learning_rate": 3.5494880546075087e-07, "loss": 0.097, "num_tokens": 98594357.0, "reward": 0.4462890625, "reward_std": 0.1728796362876892, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3994140625, "rewards/tag_count_reward/std": 0.17029859125614166, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.447265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1530.560546875, "completions/mean_terminated_length": 1111.8551025390625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 0.3101142570376396, "epoch": 0.03618982587913964, "frac_reward_zero_std": 0.03125, "grad_norm": 0.10608545366900422, "learning_rate": 3.583617747440273e-07, "loss": 0.1015, "num_tokens": 99461524.0, "reward": 0.53564453125, "reward_std": 0.23677986860275269, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.43798828125, "rewards/tag_count_reward/std": 0.20014001429080963, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.37890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1465.150390625, "completions/mean_terminated_length": 1109.575439453125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3089887276291847, "epoch": 0.036531239330829635, "frac_reward_zero_std": 0.0625, "grad_norm": 0.10936916518810315, "learning_rate": 3.6177474402730373e-07, "loss": 0.0976, "num_tokens": 100288897.0, "reward": 0.55517578125, "reward_std": 0.2415963113307953, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.46533203125, "rewards/tag_count_reward/std": 0.2024659365415573, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1705.1640625, "completions/mean_terminated_length": 1250.127197265625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.32062363624572754, "epoch": 0.03687265278251963, "frac_reward_zero_std": 0.09375, "grad_norm": 0.08906839421466099, "learning_rate": 3.6518771331058016e-07, "loss": 0.0662, "num_tokens": 101243941.0, "reward": 0.45703125, "reward_std": 0.22079730033874512, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.396484375, "rewards/tag_count_reward/std": 0.17563790082931519, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.447265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1550.224609375, "completions/mean_terminated_length": 1147.43115234375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.34388235211372375, "epoch": 0.03721406623420963, "frac_reward_zero_std": 0.0625, "grad_norm": 0.10050230171616052, "learning_rate": 3.6860068259385664e-07, "loss": 0.0973, "num_tokens": 102109272.0, "reward": 0.54248046875, "reward_std": 0.23939131200313568, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.43115234375, "rewards/tag_count_reward/std": 0.18773873150348663, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.42578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1521.1640625, "completions/mean_terminated_length": 1130.5169677734375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2965521663427353, "epoch": 0.037555479685899625, "frac_reward_zero_std": 0.03125, "grad_norm": 0.10371922854727045, "learning_rate": 3.7201365187713307e-07, "loss": 0.0954, "num_tokens": 102963180.0, "reward": 0.556640625, "reward_std": 0.2666396498680115, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.447265625, "rewards/tag_count_reward/std": 0.19572821259498596, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.50390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1589.8359375, "completions/mean_terminated_length": 1124.4566650390625, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "entropy": 0.34783126413822174, "epoch": 0.03789689313758962, "frac_reward_zero_std": 0.0, "grad_norm": 0.1045141018221958, "learning_rate": 3.754266211604095e-07, "loss": 0.1122, "num_tokens": 103854056.0, "reward": 0.48779296875, "reward_std": 0.2536656856536865, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.42919921875, "rewards/tag_count_reward/std": 0.19280590116977692, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1496.39453125, "completions/mean_terminated_length": 1106.5933837890625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.32779789716005325, "epoch": 0.03823830658927962, "frac_reward_zero_std": 0.0, "grad_norm": 0.11003237639468398, "learning_rate": 3.78839590443686e-07, "loss": 0.1044, "num_tokens": 104695138.0, "reward": 0.5166015625, "reward_std": 0.24735555052757263, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4580078125, "rewards/tag_count_reward/std": 0.20676980912685394, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1467.478515625, "completions/mean_terminated_length": 1043.854736328125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.32212459295988083, "epoch": 0.038579720040969614, "frac_reward_zero_std": 0.0, "grad_norm": 0.10644127829838883, "learning_rate": 3.8225255972696247e-07, "loss": 0.129, "num_tokens": 105520711.0, "reward": 0.556640625, "reward_std": 0.23715651035308838, "rewards/accuracy_reward/mean": 0.09072580933570862, "rewards/accuracy_reward/std": 0.2875087857246399, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.46875, "rewards/tag_count_reward/std": 0.2184046357870102, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.32421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1375.111328125, "completions/mean_terminated_length": 1052.2802734375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.3305097669363022, "epoch": 0.03892113349265961, "frac_reward_zero_std": 0.03125, "grad_norm": 0.12041007860841063, "learning_rate": 3.856655290102389e-07, "loss": 0.097, "num_tokens": 106303648.0, "reward": 0.60302734375, "reward_std": 0.24754047393798828, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.49560546875, "rewards/tag_count_reward/std": 0.2094985693693161, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.490234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1606.32421875, "completions/mean_terminated_length": 1181.57080078125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.3363782688975334, "epoch": 0.03926254694434961, "frac_reward_zero_std": 0.0, "grad_norm": 0.10318950635504888, "learning_rate": 3.8907849829351533e-07, "loss": 0.097, "num_tokens": 107194310.0, "reward": 0.57177734375, "reward_std": 0.3044593334197998, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.43115234375, "rewards/tag_count_reward/std": 0.19351330399513245, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.474609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1550.400390625, "completions/mean_terminated_length": 1100.8958740234375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.3012431710958481, "epoch": 0.039603960396039604, "frac_reward_zero_std": 0.03125, "grad_norm": 0.09877332698403518, "learning_rate": 3.924914675767918e-07, "loss": 0.1137, "num_tokens": 108058307.0, "reward": 0.4892578125, "reward_std": 0.2292594611644745, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4365234375, "rewards/tag_count_reward/std": 0.20059603452682495, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.302734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1303.173828125, "completions/mean_terminated_length": 979.7899169921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 0.3630156069993973, "epoch": 0.0399453738477296, "frac_reward_zero_std": 0.0, "grad_norm": 0.12666285467684227, "learning_rate": 3.9590443686006824e-07, "loss": 0.1461, "num_tokens": 108798796.0, "reward": 0.5888671875, "reward_std": 0.26228705048561096, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5068359375, "rewards/tag_count_reward/std": 0.21319521963596344, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.333984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1424.533203125, "completions/mean_terminated_length": 1111.8856201171875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.36783356219530106, "epoch": 0.0402867872994196, "frac_reward_zero_std": 0.09375, "grad_norm": 0.12083448143671924, "learning_rate": 3.993174061433447e-07, "loss": 0.125, "num_tokens": 109607613.0, "reward": 0.54833984375, "reward_std": 0.22951078414916992, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.46435546875, "rewards/tag_count_reward/std": 0.1898190975189209, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.513671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1629.154296875, "completions/mean_terminated_length": 1186.759033203125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.35083698481321335, "epoch": 0.040628200751109594, "frac_reward_zero_std": 0.0, "grad_norm": 0.1074914882420499, "learning_rate": 4.0273037542662116e-07, "loss": 0.1153, "num_tokens": 110527276.0, "reward": 0.54296875, "reward_std": 0.2826029062271118, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.427734375, "rewards/tag_count_reward/std": 0.1919422447681427, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1500.43359375, "completions/mean_terminated_length": 1113.4866943359375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.3433910831809044, "epoch": 0.04096961420279959, "frac_reward_zero_std": 0.0, "grad_norm": 0.11169342629054264, "learning_rate": 4.061433447098976e-07, "loss": 0.11, "num_tokens": 111366218.0, "reward": 0.56640625, "reward_std": 0.28910523653030396, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.45703125, "rewards/tag_count_reward/std": 0.19933690130710602, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.439453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1529.0390625, "completions/mean_terminated_length": 1122.1881103515625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.3600512593984604, "epoch": 0.04131102765448959, "frac_reward_zero_std": 0.03125, "grad_norm": 0.10765039148912138, "learning_rate": 4.09556313993174e-07, "loss": 0.1361, "num_tokens": 112225934.0, "reward": 0.53857421875, "reward_std": 0.24398504197597504, "rewards/accuracy_reward/mean": 0.09677419066429138, "rewards/accuracy_reward/std": 0.2959485352039337, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.44482421875, "rewards/tag_count_reward/std": 0.20213529467582703, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.404296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1505.3125, "completions/mean_terminated_length": 1136.9967041015625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.3359069302678108, "epoch": 0.041652441106179584, "frac_reward_zero_std": 0.0625, "grad_norm": 0.10369017129673871, "learning_rate": 4.1296928327645045e-07, "loss": 0.1279, "num_tokens": 113070286.0, "reward": 0.56201171875, "reward_std": 0.22775989770889282, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.44873046875, "rewards/tag_count_reward/std": 0.20316322147846222, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1421.48828125, "completions/mean_terminated_length": 1019.8782348632812, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.3178732320666313, "epoch": 0.04199385455786958, "frac_reward_zero_std": 0.03125, "grad_norm": 0.11084982289629901, "learning_rate": 4.1638225255972693e-07, "loss": 0.0947, "num_tokens": 113875944.0, "reward": 0.60302734375, "reward_std": 0.3072666823863983, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.48779296875, "rewards/tag_count_reward/std": 0.2397046685218811, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.333984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1309.484375, "completions/mean_terminated_length": 939.1436767578125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.33351579308509827, "epoch": 0.04233526800955958, "frac_reward_zero_std": 0.0, "grad_norm": 0.12729377142252357, "learning_rate": 4.1979522184300336e-07, "loss": 0.124, "num_tokens": 114619168.0, "reward": 0.65478515625, "reward_std": 0.2901787757873535, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.49658203125, "rewards/tag_count_reward/std": 0.22635336220264435, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.373046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1387.017578125, "completions/mean_terminated_length": 993.7227172851562, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.3622526749968529, "epoch": 0.042676681461249574, "frac_reward_zero_std": 0.0, "grad_norm": 0.11915614273256953, "learning_rate": 4.2320819112627985e-07, "loss": 0.1492, "num_tokens": 115416121.0, "reward": 0.5390625, "reward_std": 0.2666547894477844, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.46484375, "rewards/tag_count_reward/std": 0.20508308708667755, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.43359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1460.431640625, "completions/mean_terminated_length": 1010.637939453125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3430361747741699, "epoch": 0.04301809491293957, "frac_reward_zero_std": 0.0, "grad_norm": 0.11435951830762078, "learning_rate": 4.2662116040955633e-07, "loss": 0.1616, "num_tokens": 116244438.0, "reward": 0.56640625, "reward_std": 0.2739081382751465, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.486328125, "rewards/tag_count_reward/std": 0.24492618441581726, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.27734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1262.41796875, "completions/mean_terminated_length": 960.9243774414062, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.31982356309890747, "epoch": 0.04335950836462957, "frac_reward_zero_std": 0.0, "grad_norm": 0.11945898311644848, "learning_rate": 4.3003412969283276e-07, "loss": 0.133, "num_tokens": 116961468.0, "reward": 0.60546875, "reward_std": 0.25118792057037354, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.529296875, "rewards/tag_count_reward/std": 0.23589587211608887, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.384765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1427.201171875, "completions/mean_terminated_length": 1038.95556640625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.3225324749946594, "epoch": 0.043700921816319564, "frac_reward_zero_std": 0.03125, "grad_norm": 0.10980723048808023, "learning_rate": 4.334470989761092e-07, "loss": 0.1527, "num_tokens": 117773395.0, "reward": 0.5556640625, "reward_std": 0.26002854108810425, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4755859375, "rewards/tag_count_reward/std": 0.20957894623279572, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1362.375, "completions/mean_terminated_length": 1027.534912109375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.33832086622714996, "epoch": 0.04404233526800956, "frac_reward_zero_std": 0.0, "grad_norm": 0.12399891997045666, "learning_rate": 4.3686006825938567e-07, "loss": 0.1656, "num_tokens": 118551699.0, "reward": 0.6171875, "reward_std": 0.3028506338596344, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.49609375, "rewards/tag_count_reward/std": 0.22004352509975433, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.31640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1342.9921875, "completions/mean_terminated_length": 1016.6742553710938, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.29847314208745956, "epoch": 0.04438374871969956, "frac_reward_zero_std": 0.03125, "grad_norm": 0.11223083637528995, "learning_rate": 4.402730375426621e-07, "loss": 0.1113, "num_tokens": 119319135.0, "reward": 0.65576171875, "reward_std": 0.3268957734107971, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.51513671875, "rewards/tag_count_reward/std": 0.22695203125476837, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1553.3828125, "completions/mean_terminated_length": 1130.4493408203125, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.32734616100788116, "epoch": 0.044725162171389554, "frac_reward_zero_std": 0.03125, "grad_norm": 0.10151175815286362, "learning_rate": 4.4368600682593853e-07, "loss": 0.1438, "num_tokens": 120193811.0, "reward": 0.54541015625, "reward_std": 0.290295273065567, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.45751953125, "rewards/tag_count_reward/std": 0.2277841418981552, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1511.564453125, "completions/mean_terminated_length": 1156.262939453125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.32982804626226425, "epoch": 0.04506657562307955, "frac_reward_zero_std": 0.03125, "grad_norm": 0.11475562782428342, "learning_rate": 4.4709897610921496e-07, "loss": 0.1448, "num_tokens": 121055380.0, "reward": 0.56640625, "reward_std": 0.2694958448410034, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.484375, "rewards/tag_count_reward/std": 0.22718821465969086, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.373046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1470.88671875, "completions/mean_terminated_length": 1127.4952392578125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.31374491751194, "epoch": 0.04540798907476955, "frac_reward_zero_std": 0.0, "grad_norm": 0.1120548054952305, "learning_rate": 4.5051194539249145e-07, "loss": 0.132, "num_tokens": 121882890.0, "reward": 0.56494140625, "reward_std": 0.25806570053100586, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.50830078125, "rewards/tag_count_reward/std": 0.22514279186725616, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1373.580078125, "completions/mean_terminated_length": 1020.3125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.31678758561611176, "epoch": 0.045749402526459544, "frac_reward_zero_std": 0.0, "grad_norm": 0.11754376553746064, "learning_rate": 4.539249146757679e-07, "loss": 0.1769, "num_tokens": 122660163.0, "reward": 0.60595703125, "reward_std": 0.31131914258003235, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.52587890625, "rewards/tag_count_reward/std": 0.24963615834712982, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.337890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1347.4140625, "completions/mean_terminated_length": 989.8878784179688, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.3504220247268677, "epoch": 0.04609081597814954, "frac_reward_zero_std": 0.0, "grad_norm": 0.12011544618652266, "learning_rate": 4.573378839590443e-07, "loss": 0.1403, "num_tokens": 123425239.0, "reward": 0.662109375, "reward_std": 0.3445023000240326, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.51171875, "rewards/tag_count_reward/std": 0.2358715683221817, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.341796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1374.009765625, "completions/mean_terminated_length": 1024.0147705078125, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 0.3385768160223961, "epoch": 0.04643222942983954, "frac_reward_zero_std": 0.0, "grad_norm": 0.12257880156852057, "learning_rate": 4.6075085324232084e-07, "loss": 0.1404, "num_tokens": 124207340.0, "reward": 0.62255859375, "reward_std": 0.3055983781814575, "rewards/accuracy_reward/mean": 0.10685484111309052, "rewards/accuracy_reward/std": 0.30924052000045776, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.51904296875, "rewards/tag_count_reward/std": 0.23251719772815704, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.35546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1485.740234375, "completions/mean_terminated_length": 1175.6453857421875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.34221919625997543, "epoch": 0.046773642881529534, "frac_reward_zero_std": 0.0625, "grad_norm": 0.11066430794930575, "learning_rate": 4.641638225255973e-07, "loss": 0.1535, "num_tokens": 125054359.0, "reward": 0.5732421875, "reward_std": 0.266997754573822, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5009765625, "rewards/tag_count_reward/std": 0.24580436944961548, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.314453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1315.1796875, "completions/mean_terminated_length": 979.042724609375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.32410547137260437, "epoch": 0.04711505633321953, "frac_reward_zero_std": 0.0, "grad_norm": 0.12537234830365845, "learning_rate": 4.675767918088737e-07, "loss": 0.1448, "num_tokens": 125799091.0, "reward": 0.60693359375, "reward_std": 0.28191959857940674, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.55029296875, "rewards/tag_count_reward/std": 0.2589586675167084, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1399.013671875, "completions/mean_terminated_length": 969.1655883789062, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.30607832223176956, "epoch": 0.04745646978490953, "frac_reward_zero_std": 0.03125, "grad_norm": 0.11664671184036385, "learning_rate": 4.709897610921502e-07, "loss": 0.1666, "num_tokens": 126596810.0, "reward": 0.61181640625, "reward_std": 0.30411070585250854, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.53369140625, "rewards/tag_count_reward/std": 0.2611742317676544, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.31640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1320.734375, "completions/mean_terminated_length": 984.1142578125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.3020649701356888, "epoch": 0.047797883236599524, "frac_reward_zero_std": 0.03125, "grad_norm": 0.11765636449263031, "learning_rate": 4.744027303754266e-07, "loss": 0.1417, "num_tokens": 127344450.0, "reward": 0.61962890625, "reward_std": 0.2805081307888031, "rewards/accuracy_reward/mean": 0.08064515888690948, "rewards/accuracy_reward/std": 0.2725643217563629, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.54150390625, "rewards/tag_count_reward/std": 0.2455296665430069, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1256.705078125, "completions/mean_terminated_length": 934.9697875976562, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.2952331304550171, "epoch": 0.04813929668828952, "frac_reward_zero_std": 0.0, "grad_norm": 0.11512181994780676, "learning_rate": 4.778156996587031e-07, "loss": 0.1461, "num_tokens": 128065387.0, "reward": 0.71875, "reward_std": 0.3535493314266205, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.583984375, "rewards/tag_count_reward/std": 0.2691352665424347, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.298828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1312.244140625, "completions/mean_terminated_length": 998.6768798828125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.2821734696626663, "epoch": 0.04848071013997952, "frac_reward_zero_std": 0.0, "grad_norm": 0.12308040534151343, "learning_rate": 4.812286689419795e-07, "loss": 0.1509, "num_tokens": 128820536.0, "reward": 0.64404296875, "reward_std": 0.2922329902648926, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.56787109375, "rewards/tag_count_reward/std": 0.25249820947647095, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.271484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1232.794921875, "completions/mean_terminated_length": 929.00537109375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.27105090022087097, "epoch": 0.048822123591669514, "frac_reward_zero_std": 0.0, "grad_norm": 0.12138724818273303, "learning_rate": 4.84641638225256e-07, "loss": 0.1649, "num_tokens": 129524927.0, "reward": 0.6630859375, "reward_std": 0.2956351041793823, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5908203125, "rewards/tag_count_reward/std": 0.2641378343105316, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.259765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1251.40625, "completions/mean_terminated_length": 971.8628540039062, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.29423508048057556, "epoch": 0.04916353704335951, "frac_reward_zero_std": 0.03125, "grad_norm": 0.1152796320297533, "learning_rate": 4.880546075085323e-07, "loss": 0.1221, "num_tokens": 130245183.0, "reward": 0.68359375, "reward_std": 0.2908443808555603, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.58984375, "rewards/tag_count_reward/std": 0.2653955817222595, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.166015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 997.57421875, "completions/mean_terminated_length": 788.4730224609375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.3402106240391731, "epoch": 0.04950495049504951, "frac_reward_zero_std": 0.0, "grad_norm": 0.15456474532076053, "learning_rate": 4.914675767918088e-07, "loss": 0.1663, "num_tokens": 130826245.0, "reward": 0.76025390625, "reward_std": 0.32774782180786133, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.64892578125, "rewards/tag_count_reward/std": 0.2515389323234558, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.263671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1233.50390625, "completions/mean_terminated_length": 941.8408203125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2953769564628601, "epoch": 0.049846363946739504, "frac_reward_zero_std": 0.0, "grad_norm": 0.12087623675750918, "learning_rate": 4.948805460750853e-07, "loss": 0.1541, "num_tokens": 131531975.0, "reward": 0.68212890625, "reward_std": 0.2980971932411194, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.59619140625, "rewards/tag_count_reward/std": 0.2544133961200714, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1123.982421875, "completions/mean_terminated_length": 932.2052001953125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2861112803220749, "epoch": 0.0501877773984295, "frac_reward_zero_std": 0.0, "grad_norm": 0.13059083792673312, "learning_rate": 4.982935153583617e-07, "loss": 0.1605, "num_tokens": 132186574.0, "reward": 0.75732421875, "reward_std": 0.3120903968811035, "rewards/accuracy_reward/mean": 0.10685484111309052, "rewards/accuracy_reward/std": 0.30924052000045776, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.65380859375, "rewards/tag_count_reward/std": 0.2529670298099518, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.26171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1241.66015625, "completions/mean_terminated_length": 955.8147583007812, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.27033383399248123, "epoch": 0.0505291908501195, "frac_reward_zero_std": 0.0, "grad_norm": 0.12534907376764384, "learning_rate": 5.017064846416383e-07, "loss": 0.1531, "num_tokens": 132901056.0, "reward": 0.7255859375, "reward_std": 0.32850611209869385, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6259765625, "rewards/tag_count_reward/std": 0.2668014168739319, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.26953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1259.298828125, "completions/mean_terminated_length": 968.28076171875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.26602189987897873, "epoch": 0.050870604301809494, "frac_reward_zero_std": 0.0, "grad_norm": 0.1180412478196381, "learning_rate": 5.051194539249146e-07, "loss": 0.1574, "num_tokens": 133623897.0, "reward": 0.70556640625, "reward_std": 0.2943551242351532, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.62744140625, "rewards/tag_count_reward/std": 0.2579345405101776, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1086.265625, "completions/mean_terminated_length": 852.8349609375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.29260967671871185, "epoch": 0.05121201775349949, "frac_reward_zero_std": 0.0, "grad_norm": 0.13969399340314134, "learning_rate": 5.085324232081911e-07, "loss": 0.1877, "num_tokens": 134255265.0, "reward": 0.7490234375, "reward_std": 0.3114307224750519, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6611328125, "rewards/tag_count_reward/std": 0.2657248079776764, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.181640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1103.46484375, "completions/mean_terminated_length": 893.8186645507812, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.3098350688815117, "epoch": 0.05155343120518949, "frac_reward_zero_std": 0.03125, "grad_norm": 0.13518097319218644, "learning_rate": 5.119453924914675e-07, "loss": 0.1578, "num_tokens": 134901007.0, "reward": 0.70556640625, "reward_std": 0.2881527543067932, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.64697265625, "rewards/tag_count_reward/std": 0.26543471217155457, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1157.1640625, "completions/mean_terminated_length": 924.5812377929688, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.24349907413125038, "epoch": 0.051894844656879484, "frac_reward_zero_std": 0.0, "grad_norm": 0.12155731149860104, "learning_rate": 5.15358361774744e-07, "loss": 0.1902, "num_tokens": 135577811.0, "reward": 0.7939453125, "reward_std": 0.3214573562145233, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6728515625, "rewards/tag_count_reward/std": 0.25965744256973267, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.197265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1159.818359375, "completions/mean_terminated_length": 941.5547485351562, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.25101546198129654, "epoch": 0.05223625810856948, "frac_reward_zero_std": 0.0, "grad_norm": 0.11867080827238577, "learning_rate": 5.187713310580204e-07, "loss": 0.1556, "num_tokens": 136246198.0, "reward": 0.7548828125, "reward_std": 0.31041961908340454, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6748046875, "rewards/tag_count_reward/std": 0.2635003626346588, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.185546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1072.529296875, "completions/mean_terminated_length": 850.2998046875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.26617948710918427, "epoch": 0.05257767156025948, "frac_reward_zero_std": 0.0, "grad_norm": 0.12979264275200966, "learning_rate": 5.221843003412969e-07, "loss": 0.1542, "num_tokens": 136869493.0, "reward": 0.7431640625, "reward_std": 0.29914432764053345, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6904296875, "rewards/tag_count_reward/std": 0.25769224762916565, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.150390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1025.908203125, "completions/mean_terminated_length": 844.9862060546875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.26833656057715416, "epoch": 0.052919085011949474, "frac_reward_zero_std": 0.0, "grad_norm": 0.12808999266392643, "learning_rate": 5.255972696245734e-07, "loss": 0.1798, "num_tokens": 137462086.0, "reward": 0.7900390625, "reward_std": 0.32535552978515625, "rewards/accuracy_reward/mean": 0.07459677755832672, "rewards/accuracy_reward/std": 0.263004869222641, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7177734375, "rewards/tag_count_reward/std": 0.25352028012275696, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.146484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 995.685546875, "completions/mean_terminated_length": 815.0823364257812, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.2596510425209999, "epoch": 0.05326049846363947, "frac_reward_zero_std": 0.0, "grad_norm": 0.13956354103587376, "learning_rate": 5.290102389078498e-07, "loss": 0.1741, "num_tokens": 138044885.0, "reward": 0.80517578125, "reward_std": 0.360893577337265, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.70751953125, "rewards/tag_count_reward/std": 0.2603570818901062, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 972.068359375, "completions/mean_terminated_length": 754.8615112304688, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.2801176831126213, "epoch": 0.05360191191532947, "frac_reward_zero_std": 0.0, "grad_norm": 0.14380496550834765, "learning_rate": 5.324232081911263e-07, "loss": 0.1928, "num_tokens": 138612424.0, "reward": 0.8232421875, "reward_std": 0.3431587219238281, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7001953125, "rewards/tag_count_reward/std": 0.25452837347984314, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 996.490234375, "completions/mean_terminated_length": 835.4482421875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2403605431318283, "epoch": 0.053943325367019464, "frac_reward_zero_std": 0.0, "grad_norm": 0.13119799065131732, "learning_rate": 5.358361774744027e-07, "loss": 0.206, "num_tokens": 139203091.0, "reward": 0.796875, "reward_std": 0.2816455662250519, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.744140625, "rewards/tag_count_reward/std": 0.24573633074760437, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.154296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 981.982421875, "completions/mean_terminated_length": 787.4896240234375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.25859130546450615, "epoch": 0.05428473881870946, "frac_reward_zero_std": 0.0, "grad_norm": 0.13981574775258823, "learning_rate": 5.392491467576792e-07, "loss": 0.2285, "num_tokens": 139786090.0, "reward": 0.79931640625, "reward_std": 0.31394490599632263, "rewards/accuracy_reward/mean": 0.07661290466785431, "rewards/accuracy_reward/std": 0.2662447690963745, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.72509765625, "rewards/tag_count_reward/std": 0.25314071774482727, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.138671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 927.853515625, "completions/mean_terminated_length": 747.5125122070312, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.2561090886592865, "epoch": 0.05462615227039946, "frac_reward_zero_std": 0.0, "grad_norm": 0.1409931332964145, "learning_rate": 5.426621160409555e-07, "loss": 0.1493, "num_tokens": 140338063.0, "reward": 0.8662109375, "reward_std": 0.366870254278183, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7255859375, "rewards/tag_count_reward/std": 0.2791510224342346, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 890.78125, "completions/mean_terminated_length": 742.9426879882812, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.2640209272503853, "epoch": 0.054967565722089454, "frac_reward_zero_std": 0.0, "grad_norm": 0.14407246990490946, "learning_rate": 5.46075085324232e-07, "loss": 0.1556, "num_tokens": 140875919.0, "reward": 0.84228515625, "reward_std": 0.3163100481033325, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.74267578125, "rewards/tag_count_reward/std": 0.26370471715927124, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.12109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 864.576171875, "completions/mean_terminated_length": 701.5266723632812, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.26322656869888306, "epoch": 0.05530897917377945, "frac_reward_zero_std": 0.0, "grad_norm": 0.1525216076157827, "learning_rate": 5.494880546075085e-07, "loss": 0.1791, "num_tokens": 141395254.0, "reward": 0.85009765625, "reward_std": 0.3136492669582367, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.74267578125, "rewards/tag_count_reward/std": 0.2518426477909088, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 869.1328125, "completions/mean_terminated_length": 766.5138549804688, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.2763616666197777, "epoch": 0.05565039262546944, "frac_reward_zero_std": 0.0, "grad_norm": 0.15255931568188863, "learning_rate": 5.52901023890785e-07, "loss": 0.1521, "num_tokens": 141916778.0, "reward": 0.8837890625, "reward_std": 0.32764044404029846, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7724609375, "rewards/tag_count_reward/std": 0.23870450258255005, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 997.45703125, "completions/mean_terminated_length": 836.5631103515625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.23183483257889748, "epoch": 0.05599180607715944, "frac_reward_zero_std": 0.0, "grad_norm": 0.1294683753956615, "learning_rate": 5.563139931740614e-07, "loss": 0.1572, "num_tokens": 142505412.0, "reward": 0.9130859375, "reward_std": 0.31266093254089355, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7685546875, "rewards/tag_count_reward/std": 0.2470916211605072, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.138671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 942.0234375, "completions/mean_terminated_length": 763.9637451171875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.23731654509902, "epoch": 0.05633321952884943, "frac_reward_zero_std": 0.0, "grad_norm": 0.13167474507587498, "learning_rate": 5.597269624573379e-07, "loss": 0.1952, "num_tokens": 143063648.0, "reward": 0.85546875, "reward_std": 0.3024081885814667, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.765625, "rewards/tag_count_reward/std": 0.2550852596759796, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.14453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 886.458984375, "completions/mean_terminated_length": 690.2168579101562, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.22530686855316162, "epoch": 0.05667463298053943, "frac_reward_zero_std": 0.0, "grad_norm": 0.13852870660501074, "learning_rate": 5.631399317406143e-07, "loss": 0.1977, "num_tokens": 143594587.0, "reward": 0.853515625, "reward_std": 0.3252817988395691, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.759765625, "rewards/tag_count_reward/std": 0.2563333809375763, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 828.416015625, "completions/mean_terminated_length": 722.252685546875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.2637714296579361, "epoch": 0.05701604643222943, "frac_reward_zero_std": 0.0, "grad_norm": 0.14446943422907355, "learning_rate": 5.665529010238907e-07, "loss": 0.1468, "num_tokens": 144089344.0, "reward": 0.9326171875, "reward_std": 0.29945018887519836, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8115234375, "rewards/tag_count_reward/std": 0.2279922217130661, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 922.640625, "completions/mean_terminated_length": 750.288330078125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.25627732276916504, "epoch": 0.05735745988391942, "frac_reward_zero_std": 0.03125, "grad_norm": 0.13522935166356126, "learning_rate": 5.699658703071673e-07, "loss": 0.1883, "num_tokens": 144648344.0, "reward": 0.83154296875, "reward_std": 0.2996981143951416, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.76513671875, "rewards/tag_count_reward/std": 0.25149333477020264, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 737.66796875, "completions/mean_terminated_length": 650.3125610351562, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.2849300727248192, "epoch": 0.05769887333560942, "frac_reward_zero_std": 0.0, "grad_norm": 0.16908014093682394, "learning_rate": 5.733788395904437e-07, "loss": 0.1158, "num_tokens": 145099390.0, "reward": 0.9609375, "reward_std": 0.2940421402454376, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.837890625, "rewards/tag_count_reward/std": 0.20711380243301392, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 850.216796875, "completions/mean_terminated_length": 745.9512329101562, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.2614511325955391, "epoch": 0.05804028678729942, "frac_reward_zero_std": 0.0, "grad_norm": 0.1495324229036176, "learning_rate": 5.767918088737202e-07, "loss": 0.1905, "num_tokens": 145613581.0, "reward": 0.8779296875, "reward_std": 0.29240119457244873, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8251953125, "rewards/tag_count_reward/std": 0.23347531259059906, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 790.052734375, "completions/mean_terminated_length": 692.0652465820312, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.24923986196517944, "epoch": 0.05838170023898941, "frac_reward_zero_std": 0.0, "grad_norm": 0.1383850663785449, "learning_rate": 5.802047781569965e-07, "loss": 0.1456, "num_tokens": 146095592.0, "reward": 0.9345703125, "reward_std": 0.30553025007247925, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8388671875, "rewards/tag_count_reward/std": 0.2280760258436203, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.080078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 802.05078125, "completions/mean_terminated_length": 693.5924072265625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.2749634236097336, "epoch": 0.05872311369067941, "frac_reward_zero_std": 0.0, "grad_norm": 0.14886097203409815, "learning_rate": 5.83617747440273e-07, "loss": 0.1381, "num_tokens": 146591874.0, "reward": 0.939453125, "reward_std": 0.30092379450798035, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.21770349144935608, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.107421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 872.98046875, "completions/mean_terminated_length": 731.5667724609375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.2446579784154892, "epoch": 0.05906452714236941, "frac_reward_zero_std": 0.0, "grad_norm": 0.14299844789992375, "learning_rate": 5.870307167235494e-07, "loss": 0.1745, "num_tokens": 147114552.0, "reward": 0.89501953125, "reward_std": 0.2881016135215759, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.84033203125, "rewards/tag_count_reward/std": 0.23568463325500488, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 769.076171875, "completions/mean_terminated_length": 695.0888061523438, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.26702849566936493, "epoch": 0.0594059405940594, "frac_reward_zero_std": 0.0, "grad_norm": 0.16618186982411437, "learning_rate": 5.904436860068259e-07, "loss": 0.1264, "num_tokens": 147588335.0, "reward": 0.98583984375, "reward_std": 0.31476593017578125, "rewards/accuracy_reward/mean": 0.12903225421905518, "rewards/accuracy_reward/std": 0.33557409048080444, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.86083984375, "rewards/tag_count_reward/std": 0.21511994302272797, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 738.875, "completions/mean_terminated_length": 665.9959106445312, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.2493576668202877, "epoch": 0.0597473540457494, "frac_reward_zero_std": 0.0, "grad_norm": 0.14995960703569466, "learning_rate": 5.938566552901024e-07, "loss": 0.153, "num_tokens": 148044383.0, "reward": 0.97802734375, "reward_std": 0.2852245271205902, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.88037109375, "rewards/tag_count_reward/std": 0.20504988729953766, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 685.4453125, "completions/mean_terminated_length": 627.1690673828125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.301467627286911, "epoch": 0.060088767497439396, "frac_reward_zero_std": 0.0625, "grad_norm": 0.1675152976075882, "learning_rate": 5.972696245733788e-07, "loss": 0.158, "num_tokens": 148475187.0, "reward": 0.97119140625, "reward_std": 0.2383856475353241, "rewards/accuracy_reward/mean": 0.08064515888690948, "rewards/accuracy_reward/std": 0.2725643217563629, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.89306640625, "rewards/tag_count_reward/std": 0.19133830070495605, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.056640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 751.6953125, "completions/mean_terminated_length": 673.8633422851562, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.281361848115921, "epoch": 0.06043018094912939, "frac_reward_zero_std": 0.03125, "grad_norm": 0.15697497006538558, "learning_rate": 6.006825938566553e-07, "loss": 0.171, "num_tokens": 148937527.0, "reward": 0.970703125, "reward_std": 0.28352487087249756, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.884765625, "rewards/tag_count_reward/std": 0.2042894810438156, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 752.751953125, "completions/mean_terminated_length": 642.9851684570312, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.26648997515439987, "epoch": 0.06077159440081939, "frac_reward_zero_std": 0.0625, "grad_norm": 0.14372670904656554, "learning_rate": 6.040955631399317e-07, "loss": 0.21, "num_tokens": 149402072.0, "reward": 1.0107421875, "reward_std": 0.30383744835853577, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8837890625, "rewards/tag_count_reward/std": 0.22647839784622192, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 735.525390625, "completions/mean_terminated_length": 662.4598388671875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.2499564066529274, "epoch": 0.061113007852509386, "frac_reward_zero_std": 0.0, "grad_norm": 0.1623486010669528, "learning_rate": 6.075085324232082e-07, "loss": 0.2084, "num_tokens": 149852789.0, "reward": 0.98779296875, "reward_std": 0.24029898643493652, "rewards/accuracy_reward/mean": 0.08669354766607285, "rewards/accuracy_reward/std": 0.281669557094574, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90380859375, "rewards/tag_count_reward/std": 0.1944688856601715, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.072265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 766.787109375, "completions/mean_terminated_length": 666.9873657226562, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.27434535324573517, "epoch": 0.06145442130419938, "frac_reward_zero_std": 0.0, "grad_norm": 0.17102590120273148, "learning_rate": 6.109215017064846e-07, "loss": 0.1815, "num_tokens": 150334472.0, "reward": 0.93115234375, "reward_std": 0.2600076198577881, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.88818359375, "rewards/tag_count_reward/std": 0.2140423059463501, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 636.486328125, "completions/mean_terminated_length": 557.9072265625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.2891497537493706, "epoch": 0.06179583475588938, "frac_reward_zero_std": 0.0625, "grad_norm": 0.17335044108412356, "learning_rate": 6.143344709897611e-07, "loss": 0.201, "num_tokens": 150742001.0, "reward": 1.0068359375, "reward_std": 0.24724730849266052, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9169921875, "rewards/tag_count_reward/std": 0.18948465585708618, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.099609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 793.525390625, "completions/mean_terminated_length": 654.7440185546875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.2533347047865391, "epoch": 0.062137248207579376, "frac_reward_zero_std": 0.125, "grad_norm": 0.1391868159122194, "learning_rate": 6.177474402730375e-07, "loss": 0.2507, "num_tokens": 151226446.0, "reward": 0.935546875, "reward_std": 0.2545580267906189, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.235612154006958, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 594.234375, "completions/mean_terminated_length": 547.3386840820312, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.303137831389904, "epoch": 0.06247866165926937, "frac_reward_zero_std": 0.03125, "grad_norm": 0.20304398968144646, "learning_rate": 6.21160409556314e-07, "loss": 0.2256, "num_tokens": 151607046.0, "reward": 0.99755859375, "reward_std": 0.23310969769954681, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.92919921875, "rewards/tag_count_reward/std": 0.17202018201351166, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 704.734375, "completions/mean_terminated_length": 609.1882934570312, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.26044318825006485, "epoch": 0.06282007511095937, "frac_reward_zero_std": 0.03125, "grad_norm": 0.16443179201878766, "learning_rate": 6.245733788395904e-07, "loss": 0.2301, "num_tokens": 152043854.0, "reward": 1.00048828125, "reward_std": 0.23889106512069702, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91455078125, "rewards/tag_count_reward/std": 0.20307853817939758, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 668.91015625, "completions/mean_terminated_length": 630.1405639648438, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.2792425900697708, "epoch": 0.06316148856264937, "frac_reward_zero_std": 0.0625, "grad_norm": 0.15800331198233872, "learning_rate": 6.279863481228669e-07, "loss": 0.1348, "num_tokens": 152465840.0, "reward": 1.00439453125, "reward_std": 0.24389448761940002, "rewards/accuracy_reward/mean": 0.08467742055654526, "rewards/accuracy_reward/std": 0.278682142496109, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.92236328125, "rewards/tag_count_reward/std": 0.1855938732624054, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 687.029296875, "completions/mean_terminated_length": 593.3006591796875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2740868851542473, "epoch": 0.06350290201433936, "frac_reward_zero_std": 0.0625, "grad_norm": 0.15331925380486364, "learning_rate": 6.313993174061433e-07, "loss": 0.126, "num_tokens": 152898255.0, "reward": 1.025390625, "reward_std": 0.2573901414871216, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2005603015422821, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 552.875, "completions/mean_terminated_length": 507.75048828125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 0.2923392653465271, "epoch": 0.06384431546602937, "frac_reward_zero_std": 0.09375, "grad_norm": 0.1785344455418854, "learning_rate": 6.348122866894197e-07, "loss": 0.1311, "num_tokens": 153256271.0, "reward": 1.07861328125, "reward_std": 0.26721978187561035, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95947265625, "rewards/tag_count_reward/std": 0.1422942876815796, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 622.689453125, "completions/mean_terminated_length": 582.6204833984375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2859421893954277, "epoch": 0.06418572891771936, "frac_reward_zero_std": 0.1875, "grad_norm": 0.17235173723662287, "learning_rate": 6.382252559726961e-07, "loss": 0.1494, "num_tokens": 153652960.0, "reward": 0.9912109375, "reward_std": 0.18125146627426147, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9501953125, "rewards/tag_count_reward/std": 0.15470431745052338, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1501.0, "completions/mean_length": 541.73828125, "completions/mean_terminated_length": 467.6598205566406, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.3047877475619316, "epoch": 0.06452714236940936, "frac_reward_zero_std": 0.125, "grad_norm": 0.19481242057499318, "learning_rate": 6.416382252559727e-07, "loss": 0.2581, "num_tokens": 154006858.0, "reward": 1.02490234375, "reward_std": 0.24941487610340118, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94091796875, "rewards/tag_count_reward/std": 0.17708925902843475, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 569.951171875, "completions/mean_terminated_length": 497.2602233886719, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.27323341369628906, "epoch": 0.06486855582109935, "frac_reward_zero_std": 0.0625, "grad_norm": 0.18744260772731702, "learning_rate": 6.450511945392492e-07, "loss": 0.3143, "num_tokens": 154371169.0, "reward": 1.00927734375, "reward_std": 0.24426379799842834, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94287109375, "rewards/tag_count_reward/std": 0.17284587025642395, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 585.44140625, "completions/mean_terminated_length": 525.98779296875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.2860838919878006, "epoch": 0.06520996927278935, "frac_reward_zero_std": 0.25, "grad_norm": 0.1541590702034284, "learning_rate": 6.484641638225256e-07, "loss": 0.1271, "num_tokens": 154752531.0, "reward": 1.0654296875, "reward_std": 0.2420130968093872, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9580078125, "rewards/tag_count_reward/std": 0.153065025806427, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 604.392578125, "completions/mean_terminated_length": 566.7835693359375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.2678963840007782, "epoch": 0.06555138272447934, "frac_reward_zero_std": 0.15625, "grad_norm": 0.1702216613686804, "learning_rate": 6.51877133105802e-07, "loss": 0.1825, "num_tokens": 155137356.0, "reward": 0.9970703125, "reward_std": 0.20263788104057312, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9501953125, "rewards/tag_count_reward/std": 0.15627752244472504, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 580.517578125, "completions/mean_terminated_length": 508.3462829589844, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.2795378342270851, "epoch": 0.06589279617616935, "frac_reward_zero_std": 0.15625, "grad_norm": 0.18653376064748622, "learning_rate": 6.552901023890784e-07, "loss": 0.2012, "num_tokens": 155518325.0, "reward": 1.01123046875, "reward_std": 0.2216024547815323, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94287109375, "rewards/tag_count_reward/std": 0.16999182105064392, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 517.48828125, "completions/mean_terminated_length": 471.2957458496094, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3144439160823822, "epoch": 0.06623420962785934, "frac_reward_zero_std": 0.40625, "grad_norm": 0.1566380389608203, "learning_rate": 6.587030716723549e-07, "loss": 0.1624, "num_tokens": 155861391.0, "reward": 1.048828125, "reward_std": 0.16765576601028442, "rewards/accuracy_reward/mean": 0.07661290466785431, "rewards/accuracy_reward/std": 0.2662447690963745, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.974609375, "rewards/tag_count_reward/std": 0.12449444830417633, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 540.146484375, "completions/mean_terminated_length": 465.9897155761719, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.2924729809165001, "epoch": 0.06657562307954934, "frac_reward_zero_std": 0.375, "grad_norm": 0.15200972978143296, "learning_rate": 6.621160409556313e-07, "loss": 0.2067, "num_tokens": 156211386.0, "reward": 1.03271484375, "reward_std": 0.20594261586666107, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95654296875, "rewards/tag_count_reward/std": 0.16682003438472748, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 508.73046875, "completions/mean_terminated_length": 468.6292724609375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.30329393595457077, "epoch": 0.06691703653123933, "frac_reward_zero_std": 0.3125, "grad_norm": 0.16381176239744968, "learning_rate": 6.655290102389079e-07, "loss": 0.0971, "num_tokens": 156547904.0, "reward": 1.0126953125, "reward_std": 0.16126835346221924, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9619140625, "rewards/tag_count_reward/std": 0.14167936146259308, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 545.060546875, "completions/mean_terminated_length": 496.57861328125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.2834962382912636, "epoch": 0.06725844998292933, "frac_reward_zero_std": 0.21875, "grad_norm": 0.1637884532919705, "learning_rate": 6.689419795221843e-07, "loss": 0.2028, "num_tokens": 156902175.0, "reward": 1.05126953125, "reward_std": 0.2083422839641571, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96728515625, "rewards/tag_count_reward/std": 0.14086346328258514, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 444.9921875, "completions/mean_terminated_length": 416.31011962890625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.29707564413547516, "epoch": 0.06759986343461932, "frac_reward_zero_std": 0.375, "grad_norm": 0.16611887074292467, "learning_rate": 6.723549488054607e-07, "loss": 0.1564, "num_tokens": 157201339.0, "reward": 1.05419921875, "reward_std": 0.16793791949748993, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98193359375, "rewards/tag_count_reward/std": 0.10393797606229782, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 498.69140625, "completions/mean_terminated_length": 464.6746520996094, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.29612472653388977, "epoch": 0.06794127688630933, "frac_reward_zero_std": 0.3125, "grad_norm": 0.1686444342259643, "learning_rate": 6.757679180887371e-07, "loss": 0.0917, "num_tokens": 157533981.0, "reward": 1.0712890625, "reward_std": 0.20079877972602844, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9716796875, "rewards/tag_count_reward/std": 0.11881966143846512, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1902.0, "completions/mean_length": 504.21484375, "completions/mean_terminated_length": 460.81524658203125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.26618726551532745, "epoch": 0.06828269033799932, "frac_reward_zero_std": 0.28125, "grad_norm": 0.1637203594313785, "learning_rate": 6.791808873720136e-07, "loss": 0.2088, "num_tokens": 157859851.0, "reward": 1.0224609375, "reward_std": 0.16755890846252441, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9677734375, "rewards/tag_count_reward/std": 0.13339802622795105, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1732.0, "completions/mean_length": 533.97265625, "completions/mean_terminated_length": 485.133056640625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 0.2728833109140396, "epoch": 0.06862410378968932, "frac_reward_zero_std": 0.40625, "grad_norm": 0.13421979042194626, "learning_rate": 6.825938566552901e-07, "loss": 0.093, "num_tokens": 158210765.0, "reward": 1.00048828125, "reward_std": 0.14752846956253052, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96533203125, "rewards/tag_count_reward/std": 0.14212632179260254, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 437.2734375, "completions/mean_terminated_length": 408.4532470703125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.3169817626476288, "epoch": 0.06896551724137931, "frac_reward_zero_std": 0.3125, "grad_norm": 0.2022016815137556, "learning_rate": 6.860068259385665e-07, "loss": 0.1565, "num_tokens": 158518793.0, "reward": 1.0986328125, "reward_std": 0.20851677656173706, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9814453125, "rewards/tag_count_reward/std": 0.10789509862661362, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.041015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1939.0, "completions/mean_length": 477.98828125, "completions/mean_terminated_length": 410.839111328125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.322284959256649, "epoch": 0.06930693069306931, "frac_reward_zero_std": 0.34375, "grad_norm": 0.17227198507590266, "learning_rate": 6.89419795221843e-07, "loss": 0.0985, "num_tokens": 158841891.0, "reward": 1.0400390625, "reward_std": 0.194302499294281, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9658203125, "rewards/tag_count_reward/std": 0.140949085354805, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1820.0, "completions/mean_length": 369.525390625, "completions/mean_terminated_length": 359.63262939453125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.3621744066476822, "epoch": 0.0696483441447593, "frac_reward_zero_std": 0.40625, "grad_norm": 0.17713556634711183, "learning_rate": 6.928327645051194e-07, "loss": 0.0596, "num_tokens": 159103360.0, "reward": 1.1044921875, "reward_std": 0.18615375459194183, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9892578125, "rewards/tag_count_reward/std": 0.07745862752199173, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1830.0, "completions/mean_length": 450.76171875, "completions/mean_terminated_length": 418.9442443847656, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.2994356155395508, "epoch": 0.0699897575964493, "frac_reward_zero_std": 0.375, "grad_norm": 0.16653145529355712, "learning_rate": 6.962457337883959e-07, "loss": 0.1276, "num_tokens": 159407542.0, "reward": 1.02734375, "reward_std": 0.16509567201137543, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.974609375, "rewards/tag_count_reward/std": 0.12151136994361877, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1799.0, "completions/mean_length": 450.822265625, "completions/mean_terminated_length": 428.68316650390625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.3224856033921242, "epoch": 0.0703311710481393, "frac_reward_zero_std": 0.5, "grad_norm": 0.1840115417136169, "learning_rate": 6.996587030716723e-07, "loss": 0.1007, "num_tokens": 159723611.0, "reward": 1.06201171875, "reward_std": 0.14643040299415588, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98193359375, "rewards/tag_count_reward/std": 0.10275448858737946, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1762.0, "completions/mean_length": 424.154296875, "completions/mean_terminated_length": 391.8067932128906, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.33179257065057755, "epoch": 0.0706725844998293, "frac_reward_zero_std": 0.5, "grad_norm": 0.16046802088456938, "learning_rate": 7.030716723549488e-07, "loss": 0.1562, "num_tokens": 160016170.0, "reward": 1.041015625, "reward_std": 0.13989919424057007, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.001953125, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.982421875, "rewards/tag_count_reward/std": 0.10805881023406982, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 398.203125, "completions/mean_terminated_length": 385.21258544921875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.2983885854482651, "epoch": 0.07101399795151929, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11423702884607163, "learning_rate": 7.064846416382251e-07, "loss": 0.0264, "num_tokens": 160292834.0, "reward": 1.11767578125, "reward_std": 0.11895354837179184, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.0704338550567627, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 395.326171875, "completions/mean_terminated_length": 388.8451232910156, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.35037388652563095, "epoch": 0.07135541140320929, "frac_reward_zero_std": 0.59375, "grad_norm": 0.15187744196144665, "learning_rate": 7.098976109215017e-07, "loss": 0.0352, "num_tokens": 160569881.0, "reward": 1.10888671875, "reward_std": 0.1451425403356552, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05285605043172836, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 387.115234375, "completions/mean_terminated_length": 364.09307861328125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.3483133167028427, "epoch": 0.07169682485489928, "frac_reward_zero_std": 0.59375, "grad_norm": 0.15753955881274648, "learning_rate": 7.133105802047781e-07, "loss": 0.1163, "num_tokens": 160837348.0, "reward": 1.00732421875, "reward_std": 0.09515057504177094, "rewards/accuracy_reward/mean": 0.02217741869390011, "rewards/accuracy_reward/std": 0.14740893244743347, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98583984375, "rewards/tag_count_reward/std": 0.09342199563980103, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 365.193359375, "completions/mean_terminated_length": 341.8673400878906, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.35006116330623627, "epoch": 0.07203823830658929, "frac_reward_zero_std": 0.4375, "grad_norm": 0.17948600900582126, "learning_rate": 7.167235494880546e-07, "loss": 0.1058, "num_tokens": 161101367.0, "reward": 1.08056640625, "reward_std": 0.19318073987960815, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98681640625, "rewards/tag_count_reward/std": 0.08955762535333633, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1601.0, "completions/mean_length": 415.078125, "completions/mean_terminated_length": 392.4435729980469, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.30095720291137695, "epoch": 0.07237965175827928, "frac_reward_zero_std": 0.375, "grad_norm": 0.17967016108940764, "learning_rate": 7.201365187713311e-07, "loss": 0.1368, "num_tokens": 161395903.0, "reward": 1.0595703125, "reward_std": 0.17141230404376984, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9833984375, "rewards/tag_count_reward/std": 0.10359380394220352, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1769.0, "completions/mean_length": 457.15234375, "completions/mean_terminated_length": 415.7074279785156, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.31281379610300064, "epoch": 0.07272106520996928, "frac_reward_zero_std": 0.21875, "grad_norm": 0.19251552688886178, "learning_rate": 7.235494880546075e-07, "loss": 0.2215, "num_tokens": 161717005.0, "reward": 1.0546875, "reward_std": 0.20354260504245758, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.974609375, "rewards/tag_count_reward/std": 0.12350809574127197, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1230.0, "completions/mean_length": 396.115234375, "completions/mean_terminated_length": 332.4847717285156, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.3682389408349991, "epoch": 0.07306247866165927, "frac_reward_zero_std": 0.625, "grad_norm": 0.16712710867646163, "learning_rate": 7.269624573378839e-07, "loss": 0.025, "num_tokens": 161991880.0, "reward": 0.98974609375, "reward_std": 0.08673419803380966, "rewards/accuracy_reward/mean": 0.02016128972172737, "rewards/accuracy_reward/std": 0.14069372415542603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97021484375, "rewards/tag_count_reward/std": 0.14064623415470123, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1567.0, "completions/mean_length": 396.009765625, "completions/mean_terminated_length": 383.001953125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.34487368911504745, "epoch": 0.07340389211334927, "frac_reward_zero_std": 0.59375, "grad_norm": 0.15412743502973308, "learning_rate": 7.303754266211603e-07, "loss": 0.114, "num_tokens": 162274029.0, "reward": 1.0634765625, "reward_std": 0.1252482831478119, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.07282997667789459, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1907.0, "completions/mean_length": 421.240234375, "completions/mean_terminated_length": 395.4186706542969, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.2899750843644142, "epoch": 0.07374530556503926, "frac_reward_zero_std": 0.375, "grad_norm": 0.16810378057812916, "learning_rate": 7.337883959044369e-07, "loss": 0.1509, "num_tokens": 162565080.0, "reward": 1.07568359375, "reward_std": 0.18480655550956726, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98193359375, "rewards/tag_count_reward/std": 0.1074102371931076, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1398.0, "completions/mean_length": 412.654296875, "completions/mean_terminated_length": 380.07769775390625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.29705727100372314, "epoch": 0.07408671901672925, "frac_reward_zero_std": 0.3125, "grad_norm": 0.1757900299666601, "learning_rate": 7.372013651877133e-07, "loss": 0.137, "num_tokens": 162852631.0, "reward": 1.08251953125, "reward_std": 0.20132778584957123, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97900390625, "rewards/tag_count_reward/std": 0.1124517023563385, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1918.0, "completions/mean_length": 386.3671875, "completions/mean_terminated_length": 369.98028564453125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.3411659374833107, "epoch": 0.07442813246841926, "frac_reward_zero_std": 0.65625, "grad_norm": 0.14968655908258868, "learning_rate": 7.406143344709898e-07, "loss": 0.0689, "num_tokens": 163124867.0, "reward": 1.0556640625, "reward_std": 0.11876828968524933, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.07770495861768723, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 433.6875, "completions/mean_terminated_length": 401.5299072265625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.32052892446517944, "epoch": 0.07476954592010925, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14606200396417035, "learning_rate": 7.440273037542661e-07, "loss": 0.0936, "num_tokens": 163437523.0, "reward": 1.05615234375, "reward_std": 0.14170771837234497, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98193359375, "rewards/tag_count_reward/std": 0.10626542568206787, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 386.76953125, "completions/mean_terminated_length": 370.3865966796875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3191162422299385, "epoch": 0.07511095937179925, "frac_reward_zero_std": 0.59375, "grad_norm": 0.15429892264564174, "learning_rate": 7.474402730375426e-07, "loss": 0.116, "num_tokens": 163716701.0, "reward": 1.06298828125, "reward_std": 0.11562823504209518, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99072265625, "rewards/tag_count_reward/std": 0.07997593283653259, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 388.39453125, "completions/mean_terminated_length": 368.7154235839844, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.311799593269825, "epoch": 0.07545237282348924, "frac_reward_zero_std": 0.53125, "grad_norm": 0.15582762317956428, "learning_rate": 7.50853242320819e-07, "loss": 0.0581, "num_tokens": 163988279.0, "reward": 1.1123046875, "reward_std": 0.17180243134498596, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9873046875, "rewards/tag_count_reward/std": 0.09297728538513184, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 384.203125, "completions/mean_terminated_length": 364.4743347167969, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.317944698035717, "epoch": 0.07579378627517924, "frac_reward_zero_std": 0.5, "grad_norm": 0.16226009980912448, "learning_rate": 7.542662116040955e-07, "loss": 0.0994, "num_tokens": 164263679.0, "reward": 1.0947265625, "reward_std": 0.1568537950515747, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9873046875, "rewards/tag_count_reward/std": 0.08755742013454437, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1281.0, "completions/mean_length": 393.234375, "completions/mean_terminated_length": 356.9021911621094, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.292220376431942, "epoch": 0.07613519972686923, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1662781562591962, "learning_rate": 7.57679180887372e-07, "loss": 0.1276, "num_tokens": 164543575.0, "reward": 1.10107421875, "reward_std": 0.20247295498847961, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98193359375, "rewards/tag_count_reward/std": 0.11187238991260529, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1197.0, "completions/mean_length": 360.517578125, "completions/mean_terminated_length": 340.5079040527344, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.3154038041830063, "epoch": 0.07647661317855924, "frac_reward_zero_std": 0.6875, "grad_norm": 0.13331479988966427, "learning_rate": 7.610921501706485e-07, "loss": 0.0861, "num_tokens": 164805264.0, "reward": 1.0537109375, "reward_std": 0.08614550530910492, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9873046875, "rewards/tag_count_reward/std": 0.08471756428480148, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1538.0, "completions/mean_length": 381.791015625, "completions/mean_terminated_length": 351.9781188964844, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.3313584104180336, "epoch": 0.07681802663024923, "frac_reward_zero_std": 0.53125, "grad_norm": 0.14176039929487488, "learning_rate": 7.645051194539249e-07, "loss": 0.0641, "num_tokens": 165081653.0, "reward": 1.091796875, "reward_std": 0.13753542304039001, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.986328125, "rewards/tag_count_reward/std": 0.09283844381570816, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 364.666015625, "completions/mean_terminated_length": 331.13348388671875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.3363476023077965, "epoch": 0.07715944008193923, "frac_reward_zero_std": 0.40625, "grad_norm": 0.19297302970480384, "learning_rate": 7.679180887372013e-07, "loss": 0.1539, "num_tokens": 165351866.0, "reward": 1.09716796875, "reward_std": 0.2020585834980011, "rewards/accuracy_reward/mean": 0.12096773833036423, "rewards/accuracy_reward/std": 0.32641899585723877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97998046875, "rewards/tag_count_reward/std": 0.11371075361967087, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1430.0, "completions/mean_length": 358.408203125, "completions/mean_terminated_length": 341.7455749511719, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.34561586380004883, "epoch": 0.07750085353362922, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1668750811817238, "learning_rate": 7.713310580204778e-07, "loss": 0.0906, "num_tokens": 165614539.0, "reward": 1.08935546875, "reward_std": 0.13752159476280212, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99169921875, "rewards/tag_count_reward/std": 0.07204344123601913, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1683.0, "completions/mean_length": 354.30078125, "completions/mean_terminated_length": 347.6588439941406, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.3096732720732689, "epoch": 0.07784226698531922, "frac_reward_zero_std": 0.5, "grad_norm": 0.1665532014190424, "learning_rate": 7.747440273037542e-07, "loss": 0.0496, "num_tokens": 165867141.0, "reward": 1.09619140625, "reward_std": 0.1712796539068222, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 367.380859375, "completions/mean_terminated_length": 344.08514404296875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.30980801582336426, "epoch": 0.07818368043700921, "frac_reward_zero_std": 0.34375, "grad_norm": 0.18093625690219878, "learning_rate": 7.781569965870307e-07, "loss": 0.1228, "num_tokens": 166139864.0, "reward": 1.09912109375, "reward_std": 0.2005949765443802, "rewards/accuracy_reward/mean": 0.11491935700178146, "rewards/accuracy_reward/std": 0.3192465901374817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98779296875, "rewards/tag_count_reward/std": 0.09104960411787033, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 387.28515625, "completions/mean_terminated_length": 374.2086486816406, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.3432101905345917, "epoch": 0.07852509388869922, "frac_reward_zero_std": 0.71875, "grad_norm": 0.12063613114505621, "learning_rate": 7.815699658703071e-07, "loss": 0.0386, "num_tokens": 166422058.0, "reward": 1.02197265625, "reward_std": 0.07415038347244263, "rewards/accuracy_reward/mean": 0.030241934582591057, "rewards/accuracy_reward/std": 0.1714252382516861, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.0704338550567627, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1670.0, "completions/mean_length": 346.712890625, "completions/mean_terminated_length": 343.383544921875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.35184088349342346, "epoch": 0.0788665073403892, "frac_reward_zero_std": 0.5625, "grad_norm": 0.16913212657884766, "learning_rate": 7.849829351535836e-07, "loss": -0.006, "num_tokens": 166675927.0, "reward": 1.0517578125, "reward_std": 0.12607382237911224, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 359.126953125, "completions/mean_terminated_length": 349.17291259765625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.3253704160451889, "epoch": 0.07920792079207921, "frac_reward_zero_std": 0.625, "grad_norm": 0.15273998643346698, "learning_rate": 7.8839590443686e-07, "loss": 0.0459, "num_tokens": 166939624.0, "reward": 1.037109375, "reward_std": 0.08311062306165695, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.060289934277534485, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 333.306640625, "completions/mean_terminated_length": 323.2004089355469, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.37531305104494095, "epoch": 0.0795493342437692, "frac_reward_zero_std": 0.59375, "grad_norm": 0.20343062640558895, "learning_rate": 7.918088737201365e-07, "loss": 0.0647, "num_tokens": 167186101.0, "reward": 1.06591796875, "reward_std": 0.14534950256347656, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1310.0, "completions/mean_length": 336.701171875, "completions/mean_terminated_length": 326.61492919921875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.32921192049980164, "epoch": 0.0798907476954592, "frac_reward_zero_std": 0.625, "grad_norm": 0.13899416468476905, "learning_rate": 7.952218430034129e-07, "loss": 0.0671, "num_tokens": 167439372.0, "reward": 1.05517578125, "reward_std": 0.10680990666151047, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.059313252568244934, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1383.0, "completions/mean_length": 397.29296875, "completions/mean_terminated_length": 364.4103698730469, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.2958144471049309, "epoch": 0.08023216114714919, "frac_reward_zero_std": 0.625, "grad_norm": 0.1265058277588042, "learning_rate": 7.986348122866893e-07, "loss": 0.1002, "num_tokens": 167715458.0, "reward": 1.06494140625, "reward_std": 0.1417403668165207, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98291015625, "rewards/tag_count_reward/std": 0.1064271554350853, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1651.0, "completions/mean_length": 325.2421875, "completions/mean_terminated_length": 308.2524719238281, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3335127532482147, "epoch": 0.0805735745988392, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1867909590135399, "learning_rate": 8.020477815699659e-07, "loss": 0.1035, "num_tokens": 167956366.0, "reward": 1.08740234375, "reward_std": 0.1578659862279892, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98974609375, "rewards/tag_count_reward/std": 0.07512112706899643, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1330.0, "completions/mean_length": 366.375, "completions/mean_terminated_length": 359.7804260253906, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.3048517107963562, "epoch": 0.08091498805052919, "frac_reward_zero_std": 0.5625, "grad_norm": 0.15262321991842295, "learning_rate": 8.054607508532423e-07, "loss": 0.0194, "num_tokens": 168227038.0, "reward": 1.0712890625, "reward_std": 0.13373082876205444, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.051642172038555145, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1960.0, "completions/mean_length": 384.482421875, "completions/mean_terminated_length": 358.077392578125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.3046407848596573, "epoch": 0.08125640150221919, "frac_reward_zero_std": 0.625, "grad_norm": 0.1331783612264827, "learning_rate": 8.088737201365188e-07, "loss": 0.0546, "num_tokens": 168497957.0, "reward": 1.0390625, "reward_std": 0.12105914950370789, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.08485843241214752, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1932.0, "completions/mean_length": 323.3984375, "completions/mean_terminated_length": 313.2337951660156, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.3607698678970337, "epoch": 0.08159781495390918, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1937608651533803, "learning_rate": 8.122866894197952e-07, "loss": 0.0518, "num_tokens": 168744209.0, "reward": 1.09716796875, "reward_std": 0.1533757597208023, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.06321259588003159, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1641.0, "completions/mean_length": 323.171875, "completions/mean_terminated_length": 316.4078674316406, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.3498314991593361, "epoch": 0.08193922840559918, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1704998383816611, "learning_rate": 8.156996587030717e-07, "loss": 0.0505, "num_tokens": 168983145.0, "reward": 1.1025390625, "reward_std": 0.14679324626922607, "rewards/accuracy_reward/mean": 0.1088709682226181, "rewards/accuracy_reward/std": 0.31179171800613403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1388.0, "completions/mean_length": 331.9453125, "completions/mean_terminated_length": 311.59686279296875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3493944779038429, "epoch": 0.08228064185728917, "frac_reward_zero_std": 0.5625, "grad_norm": 0.16440699504979278, "learning_rate": 8.19112627986348e-07, "loss": 0.0639, "num_tokens": 169226461.0, "reward": 1.080078125, "reward_std": 0.14026962220668793, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.0775880515575409, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1704.0, "completions/mean_length": 348.33984375, "completions/mean_terminated_length": 328.185791015625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.3353010565042496, "epoch": 0.08262205530897918, "frac_reward_zero_std": 0.46875, "grad_norm": 0.16172626307218466, "learning_rate": 8.225255972696245e-07, "loss": 0.0913, "num_tokens": 169483307.0, "reward": 1.09521484375, "reward_std": 0.15052729845046997, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98974609375, "rewards/tag_count_reward/std": 0.0843261182308197, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1682.0, "completions/mean_length": 326.89453125, "completions/mean_terminated_length": 323.52642822265625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.3333131670951843, "epoch": 0.08296346876066917, "frac_reward_zero_std": 0.625, "grad_norm": 0.1680080938163792, "learning_rate": 8.259385665529009e-07, "loss": 0.0335, "num_tokens": 169730149.0, "reward": 1.1240234375, "reward_std": 0.12905356287956238, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.041276250034570694, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1091.0, "completions/mean_length": 322.19921875, "completions/mean_terminated_length": 318.8219299316406, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.3453241065144539, "epoch": 0.08330488221235917, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1578877046100359, "learning_rate": 8.293515358361775e-07, "loss": 0.0302, "num_tokens": 169973755.0, "reward": 1.09228515625, "reward_std": 0.11627434939146042, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 346.984375, "completions/mean_terminated_length": 340.3137512207031, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.3258212059736252, "epoch": 0.08364629566404916, "frac_reward_zero_std": 0.71875, "grad_norm": 0.1310373072905343, "learning_rate": 8.327645051194539e-07, "loss": 0.0398, "num_tokens": 170225667.0, "reward": 1.0390625, "reward_std": 0.08565235137939453, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.05608600005507469, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1564.0, "completions/mean_length": 370.287109375, "completions/mean_terminated_length": 353.7416076660156, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.32537565380334854, "epoch": 0.08398770911573916, "frac_reward_zero_std": 0.65625, "grad_norm": 0.14698010343754273, "learning_rate": 8.361774744027303e-07, "loss": 0.0894, "num_tokens": 170494406.0, "reward": 1.0615234375, "reward_std": 0.1269083023071289, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9892578125, "rewards/tag_count_reward/std": 0.083536297082901, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1340.0, "completions/mean_length": 374.189453125, "completions/mean_terminated_length": 350.9881286621094, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.31383174657821655, "epoch": 0.08432912256742915, "frac_reward_zero_std": 0.53125, "grad_norm": 0.14822198543378248, "learning_rate": 8.395904436860067e-07, "loss": 0.071, "num_tokens": 170767783.0, "reward": 1.0751953125, "reward_std": 0.14658299088478088, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9873046875, "rewards/tag_count_reward/std": 0.09809815883636475, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 366.107421875, "completions/mean_terminated_length": 356.19451904296875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.31980784982442856, "epoch": 0.08467053601911916, "frac_reward_zero_std": 0.625, "grad_norm": 0.1364209677939243, "learning_rate": 8.430034129692832e-07, "loss": 0.0577, "num_tokens": 171036750.0, "reward": 1.07958984375, "reward_std": 0.10543649643659592, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.050489041954278946, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1907.0, "completions/mean_length": 327.22265625, "completions/mean_terminated_length": 310.2524719238281, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.3485024869441986, "epoch": 0.08501194947080914, "frac_reward_zero_std": 0.59375, "grad_norm": 0.16453238234244175, "learning_rate": 8.464163822525597e-07, "loss": 0.1019, "num_tokens": 171285168.0, "reward": 1.06494140625, "reward_std": 0.14472603797912598, "rewards/accuracy_reward/mean": 0.07459677755832672, "rewards/accuracy_reward/std": 0.263004869222641, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 345.283203125, "completions/mean_terminated_length": 318.2559814453125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.3300977572798729, "epoch": 0.08535336292249915, "frac_reward_zero_std": 0.59375, "grad_norm": 0.15372355858185224, "learning_rate": 8.498293515358362e-07, "loss": 0.0809, "num_tokens": 171534097.0, "reward": 1.05908203125, "reward_std": 0.14251339435577393, "rewards/accuracy_reward/mean": 0.07708333432674408, "rewards/accuracy_reward/std": 0.2670018970966339, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98681640625, "rewards/tag_count_reward/std": 0.09486328065395355, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 342.04296875, "completions/mean_terminated_length": 325.21893310546875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.34339532256126404, "epoch": 0.08569477637418914, "frac_reward_zero_std": 0.65625, "grad_norm": 0.15878266071594602, "learning_rate": 8.532423208191127e-07, "loss": 0.1067, "num_tokens": 171784775.0, "reward": 1.07177734375, "reward_std": 0.11504203081130981, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99169921875, "rewards/tag_count_reward/std": 0.0753624215722084, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1838.0, "completions/mean_length": 353.69140625, "completions/mean_terminated_length": 340.35040283203125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.32888951152563095, "epoch": 0.08603618982587914, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14909358551204485, "learning_rate": 8.56655290102389e-07, "loss": 0.0761, "num_tokens": 172052537.0, "reward": 1.04541015625, "reward_std": 0.12174312770366669, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.06867539137601852, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1615.0, "completions/mean_length": 375.791015625, "completions/mean_terminated_length": 355.96246337890625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.3057410195469856, "epoch": 0.08637760327756913, "frac_reward_zero_std": 0.53125, "grad_norm": 0.26852809560395113, "learning_rate": 8.600682593856655e-07, "loss": 0.0957, "num_tokens": 172333774.0, "reward": 1.041015625, "reward_std": 0.12753859162330627, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.0862877294421196, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1405.0, "completions/mean_length": 343.72265625, "completions/mean_terminated_length": 330.30316162109375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.31345490366220474, "epoch": 0.08671901672925914, "frac_reward_zero_std": 0.53125, "grad_norm": 0.15413973814487367, "learning_rate": 8.634812286689419e-07, "loss": 0.0861, "num_tokens": 172578800.0, "reward": 1.125, "reward_std": 0.17781919240951538, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.062285590916872025, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1236.0, "completions/mean_length": 349.171875, "completions/mean_terminated_length": 325.623779296875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.31374574452638626, "epoch": 0.08706043018094912, "frac_reward_zero_std": 0.4375, "grad_norm": 0.18062981732989172, "learning_rate": 8.668941979522184e-07, "loss": 0.1279, "num_tokens": 172841320.0, "reward": 1.0888671875, "reward_std": 0.17702364921569824, "rewards/accuracy_reward/mean": 0.10483870655298233, "rewards/accuracy_reward/std": 0.30665475130081177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9873046875, "rewards/tag_count_reward/std": 0.09557202458381653, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 365.06640625, "completions/mean_terminated_length": 351.8149719238281, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.3165179640054703, "epoch": 0.08740184363263913, "frac_reward_zero_std": 0.53125, "grad_norm": 0.16968649582119683, "learning_rate": 8.703071672354948e-07, "loss": 0.084, "num_tokens": 173107738.0, "reward": 1.06689453125, "reward_std": 0.13615740835666656, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98681640625, "rewards/tag_count_reward/std": 0.08818135410547256, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 354.66015625, "completions/mean_terminated_length": 320.92828369140625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.3210231214761734, "epoch": 0.08774325708432912, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1801397659678604, "learning_rate": 8.737201365187713e-07, "loss": 0.1931, "num_tokens": 173364524.0, "reward": 1.0693359375, "reward_std": 0.16703152656555176, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9853515625, "rewards/tag_count_reward/std": 0.10388854891061783, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 404.544921875, "completions/mean_terminated_length": 378.4583435058594, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.2882358655333519, "epoch": 0.08808467053601912, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1393023475865159, "learning_rate": 8.771331058020477e-07, "loss": 0.0715, "num_tokens": 173640627.0, "reward": 1.0234375, "reward_std": 0.12133342772722244, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.10014666616916656, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1297.0, "completions/mean_length": 350.388671875, "completions/mean_terminated_length": 347.0665283203125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.3163047954440117, "epoch": 0.08842608398770911, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1497100757714393, "learning_rate": 8.805460750853242e-07, "loss": 0.0287, "num_tokens": 173894458.0, "reward": 1.13818359375, "reward_std": 0.13633711636066437, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05285605043172836, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1834.0, "completions/mean_length": 378.626953125, "completions/mean_terminated_length": 365.4822692871094, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.3137529566884041, "epoch": 0.08876749743939912, "frac_reward_zero_std": 0.65625, "grad_norm": 0.14180331542831665, "learning_rate": 8.839590443686007e-07, "loss": 0.0123, "num_tokens": 174167579.0, "reward": 1.03466796875, "reward_std": 0.10436594486236572, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.06697065383195877, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 377.419921875, "completions/mean_terminated_length": 344.1414489746094, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.32930584996938705, "epoch": 0.0891089108910891, "frac_reward_zero_std": 0.65625, "grad_norm": 0.16424883801078313, "learning_rate": 8.873720136518771e-07, "loss": 0.1043, "num_tokens": 174435010.0, "reward": 1.02490234375, "reward_std": 0.10302825272083282, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98388671875, "rewards/tag_count_reward/std": 0.10657967627048492, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 329.3671875, "completions/mean_terminated_length": 326.00390625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.3528459370136261, "epoch": 0.08945032434277911, "frac_reward_zero_std": 0.4375, "grad_norm": 0.20451910763070216, "learning_rate": 8.907849829351535e-07, "loss": 0.0195, "num_tokens": 174678574.0, "reward": 1.13134765625, "reward_std": 0.21066230535507202, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1681.0, "completions/max_terminated_length": 1681.0, "completions/mean_length": 324.232421875, "completions/mean_terminated_length": 324.232421875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.31950196623802185, "epoch": 0.0897917377944691, "frac_reward_zero_std": 0.6875, "grad_norm": 0.13646284793943467, "learning_rate": 8.941979522184299e-07, "loss": 0.0005, "num_tokens": 174918357.0, "reward": 1.11328125, "reward_std": 0.1343458741903305, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 379.255859375, "completions/mean_terminated_length": 366.11614990234375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.3276415020227432, "epoch": 0.0901331512461591, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1533748061447726, "learning_rate": 8.976109215017065e-07, "loss": 0.058, "num_tokens": 175196104.0, "reward": 1.03662109375, "reward_std": 0.10469317436218262, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99169921875, "rewards/tag_count_reward/std": 0.06856399029493332, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1742.0, "completions/mean_length": 367.916015625, "completions/mean_terminated_length": 351.3471374511719, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.32223908603191376, "epoch": 0.09047456469784909, "frac_reward_zero_std": 0.625, "grad_norm": 0.14311017323420458, "learning_rate": 9.010238907849829e-07, "loss": 0.1118, "num_tokens": 175466189.0, "reward": 1.10205078125, "reward_std": 0.13318349421024323, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.0704338550567627, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 381.341796875, "completions/mean_terminated_length": 368.218505859375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.3005932420492172, "epoch": 0.0908159781495391, "frac_reward_zero_std": 0.59375, "grad_norm": 0.147283062730247, "learning_rate": 9.044368600682594e-07, "loss": 0.0841, "num_tokens": 175736812.0, "reward": 1.0791015625, "reward_std": 0.1375274658203125, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 374.05859375, "completions/mean_terminated_length": 360.8779602050781, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.3017735555768013, "epoch": 0.09115739160122908, "frac_reward_zero_std": 0.5, "grad_norm": 0.15120831840655832, "learning_rate": 9.078498293515358e-07, "loss": 0.0384, "num_tokens": 176006538.0, "reward": 1.130859375, "reward_std": 0.17605742812156677, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1471.0, "completions/mean_length": 371.408203125, "completions/mean_terminated_length": 358.2066955566406, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.32265833020210266, "epoch": 0.09149880505291909, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14186185490620237, "learning_rate": 9.112627986348122e-07, "loss": 0.0748, "num_tokens": 176273451.0, "reward": 1.072265625, "reward_std": 0.12172654271125793, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.07294141501188278, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1657.0, "completions/mean_length": 356.46875, "completions/mean_terminated_length": 343.14959716796875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.3403845429420471, "epoch": 0.09184021850460908, "frac_reward_zero_std": 0.4375, "grad_norm": 0.17895076324949058, "learning_rate": 9.146757679180886e-07, "loss": 0.0381, "num_tokens": 176528731.0, "reward": 1.1142578125, "reward_std": 0.22252048552036285, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.0678301453590393, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 386.794921875, "completions/mean_terminated_length": 353.70318603515625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.3001314625144005, "epoch": 0.09218163195629908, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14374762874886957, "learning_rate": 9.180887372013651e-07, "loss": 0.1639, "num_tokens": 176798130.0, "reward": 1.1103515625, "reward_std": 0.14695771038532257, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9853515625, "rewards/tag_count_reward/std": 0.10388854891061783, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1468.0, "completions/mean_length": 349.689453125, "completions/mean_terminated_length": 346.3659362792969, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.31350842863321304, "epoch": 0.09252304540798907, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1262674577779299, "learning_rate": 9.215017064846417e-07, "loss": 0.0073, "num_tokens": 177051091.0, "reward": 1.1015625, "reward_std": 0.10915680229663849, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 352.349609375, "completions/mean_terminated_length": 338.9980163574219, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.33668920397758484, "epoch": 0.09286445885967907, "frac_reward_zero_std": 0.59375, "grad_norm": 0.16562429853813251, "learning_rate": 9.249146757679181e-07, "loss": 0.0622, "num_tokens": 177305590.0, "reward": 1.07568359375, "reward_std": 0.12737616896629333, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99169921875, "rewards/tag_count_reward/std": 0.0753624215722084, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 350.22265625, "completions/mean_terminated_length": 336.8543395996094, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.32842738181352615, "epoch": 0.09320587231136906, "frac_reward_zero_std": 0.375, "grad_norm": 0.1854661242516218, "learning_rate": 9.283276450511945e-07, "loss": 0.0709, "num_tokens": 177562968.0, "reward": 1.10791015625, "reward_std": 0.19689175486564636, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.06867539137601852, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1306.0, "completions/mean_length": 381.8046875, "completions/mean_terminated_length": 365.3727722167969, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.2850857302546501, "epoch": 0.09354728576305907, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1154833413775541, "learning_rate": 9.317406143344709e-07, "loss": 0.0751, "num_tokens": 177829828.0, "reward": 1.0341796875, "reward_std": 0.11371214687824249, "rewards/accuracy_reward/mean": 0.04435483738780022, "rewards/accuracy_reward/std": 0.2060900777578354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.07611466944217682, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1894.0, "completions/mean_length": 359.904296875, "completions/mean_terminated_length": 353.2843322753906, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.29632118344306946, "epoch": 0.09388869921474906, "frac_reward_zero_std": 0.75, "grad_norm": 0.11251876168939856, "learning_rate": 9.351535836177474e-07, "loss": 0.0029, "num_tokens": 178097155.0, "reward": 1.04345703125, "reward_std": 0.09095922112464905, "rewards/accuracy_reward/mean": 0.04838709533214569, "rewards/accuracy_reward/std": 0.21479946374893188, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 416.6953125, "completions/mean_terminated_length": 403.85040283203125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.27252547442913055, "epoch": 0.09423011266643906, "frac_reward_zero_std": 0.46875, "grad_norm": 0.17643468619340813, "learning_rate": 9.385665529010238e-07, "loss": 0.0834, "num_tokens": 178382839.0, "reward": 1.1201171875, "reward_std": 0.17883071303367615, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06960996240377426, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1610.0, "completions/mean_length": 376.921875, "completions/mean_terminated_length": 347.0218505859375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.3187052458524704, "epoch": 0.09457152611812905, "frac_reward_zero_std": 0.5625, "grad_norm": 0.16252554460494695, "learning_rate": 9.419795221843004e-07, "loss": 0.1275, "num_tokens": 178650143.0, "reward": 1.06884765625, "reward_std": 0.14411534368991852, "rewards/accuracy_reward/mean": 0.08467742055654526, "rewards/accuracy_reward/std": 0.278682142496109, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98681640625, "rewards/tag_count_reward/std": 0.09865544736385345, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 383.900390625, "completions/mean_terminated_length": 364.1679992675781, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.3247288092970848, "epoch": 0.09491293956981905, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14785361308636039, "learning_rate": 9.453924914675768e-07, "loss": 0.0944, "num_tokens": 178922652.0, "reward": 1.1044921875, "reward_std": 0.1680164337158203, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.08079168945550919, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 446.966796875, "completions/mean_terminated_length": 371.6911926269531, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3138120546936989, "epoch": 0.09525435302150904, "frac_reward_zero_std": 0.5, "grad_norm": 0.1470703775133425, "learning_rate": 9.488054607508532e-07, "loss": 0.1557, "num_tokens": 179238939.0, "reward": 1.07763671875, "reward_std": 0.16149860620498657, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96630859375, "rewards/tag_count_reward/std": 0.1515175849199295, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1115.0, "completions/mean_length": 375.640625, "completions/mean_terminated_length": 369.0823669433594, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.30660589039325714, "epoch": 0.09559576647319905, "frac_reward_zero_std": 0.65625, "grad_norm": 0.14258559132287324, "learning_rate": 9.522184300341296e-07, "loss": 0.0195, "num_tokens": 179500067.0, "reward": 1.04052734375, "reward_std": 0.11066632717847824, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.05711371824145317, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1475.0, "completions/mean_length": 361.328125, "completions/mean_terminated_length": 348.0472412109375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.3152089938521385, "epoch": 0.09593717992488904, "frac_reward_zero_std": 0.59375, "grad_norm": 0.15359706195236136, "learning_rate": 9.556313993174062e-07, "loss": 0.0799, "num_tokens": 179759787.0, "reward": 1.0966796875, "reward_std": 0.1391814947128296, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06960996240377426, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1869.0, "completions/mean_length": 399.095703125, "completions/mean_terminated_length": 386.1122131347656, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.28338416665792465, "epoch": 0.09627859337657904, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13344915142371033, "learning_rate": 9.590443686006826e-07, "loss": 0.0433, "num_tokens": 180034652.0, "reward": 1.10888671875, "reward_std": 0.140140563249588, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.06697065383195877, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1744.0, "completions/mean_length": 357.173828125, "completions/mean_terminated_length": 353.8649597167969, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.33597178012132645, "epoch": 0.09662000682826903, "frac_reward_zero_std": 0.71875, "grad_norm": 0.14311198868604533, "learning_rate": 9.62457337883959e-07, "loss": 0.0184, "num_tokens": 180305813.0, "reward": 1.0263671875, "reward_std": 0.07543058693408966, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.041276250034570694, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1445.0, "completions/mean_length": 336.962890625, "completions/mean_terminated_length": 320.0887756347656, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.3075854331254959, "epoch": 0.09696142027995903, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1862102520024603, "learning_rate": 9.658703071672355e-07, "loss": 0.0774, "num_tokens": 180546770.0, "reward": 1.146484375, "reward_std": 0.21961520612239838, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.07459938526153564, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1290.0, "completions/mean_length": 392.046875, "completions/mean_terminated_length": 375.7159729003906, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.30396026372909546, "epoch": 0.09730283373164902, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13815482374753607, "learning_rate": 9.69283276450512e-07, "loss": 0.0629, "num_tokens": 180830890.0, "reward": 1.181640625, "reward_std": 0.17109599709510803, "rewards/accuracy_reward/mean": 0.189453125, "rewards/accuracy_reward/std": 0.3922513723373413, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.07459938526153564, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1574.0, "completions/mean_length": 434.8828125, "completions/mean_terminated_length": 386.1971740722656, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.26808659732341766, "epoch": 0.09764424718333903, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13025725996813536, "learning_rate": 9.726962457337883e-07, "loss": 0.0916, "num_tokens": 181132894.0, "reward": 1.099609375, "reward_std": 0.17192813754081726, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.11539676785469055, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1294.0, "completions/mean_length": 399.921875, "completions/mean_terminated_length": 380.37945556640625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.26852068677544594, "epoch": 0.09798566063502902, "frac_reward_zero_std": 0.5, "grad_norm": 0.17411489518967946, "learning_rate": 9.761092150170647e-07, "loss": 0.1091, "num_tokens": 181424566.0, "reward": 1.07861328125, "reward_std": 0.17468351125717163, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98876953125, "rewards/tag_count_reward/std": 0.08564164489507675, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 394.025390625, "completions/mean_terminated_length": 367.7718505859375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.3022923320531845, "epoch": 0.09832707408671902, "frac_reward_zero_std": 0.6875, "grad_norm": 0.15064433454094484, "learning_rate": 9.795221843003413e-07, "loss": 0.1022, "num_tokens": 181707619.0, "reward": 1.029296875, "reward_std": 0.10611555725336075, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.09310565888881683, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1390.0, "completions/mean_length": 357.072265625, "completions/mean_terminated_length": 350.4411926269531, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3134869858622551, "epoch": 0.09866848753840901, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14662201196282293, "learning_rate": 9.829351535836176e-07, "loss": 0.0481, "num_tokens": 181968152.0, "reward": 1.10791015625, "reward_std": 0.14763736724853516, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 341.533203125, "completions/mean_terminated_length": 334.8411865234375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.31528591364622116, "epoch": 0.09900990099009901, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12387257445275016, "learning_rate": 9.863481228668942e-07, "loss": 0.0233, "num_tokens": 182218777.0, "reward": 1.08837890625, "reward_std": 0.10366291552782059, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04269581660628319, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 420.7421875, "completions/mean_terminated_length": 407.92913818359375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.26595553755760193, "epoch": 0.099351314441789, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11178174077214716, "learning_rate": 9.897610921501706e-07, "loss": 0.0608, "num_tokens": 182511637.0, "reward": 1.08984375, "reward_std": 0.12279774248600006, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 358.1015625, "completions/mean_terminated_length": 351.4745178222656, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2972301319241524, "epoch": 0.09969272789347901, "frac_reward_zero_std": 0.53125, "grad_norm": 0.14673744515126916, "learning_rate": 9.93174061433447e-07, "loss": 0.0277, "num_tokens": 182770505.0, "reward": 1.08349609375, "reward_std": 0.14180773496627808, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05285605043172836, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 350.09375, "completions/mean_terminated_length": 336.7243957519531, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.33250968903303146, "epoch": 0.100034141345169, "frac_reward_zero_std": 0.5, "grad_norm": 0.1595946108873664, "learning_rate": 9.965870307167234e-07, "loss": 0.0842, "num_tokens": 183025785.0, "reward": 1.123046875, "reward_std": 0.1901281774044037, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1149.0, "completions/mean_length": 367.4609375, "completions/mean_terminated_length": 347.5335998535156, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.30761604756116867, "epoch": 0.100375554796859, "frac_reward_zero_std": 0.65625, "grad_norm": 0.15095857050337932, "learning_rate": 1e-06, "loss": 0.1268, "num_tokens": 183291221.0, "reward": 1.052734375, "reward_std": 0.12014563381671906, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.07459938526153564, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1259.0, "completions/mean_length": 360.474609375, "completions/mean_terminated_length": 350.52850341796875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.2988385930657387, "epoch": 0.10071696824854899, "frac_reward_zero_std": 0.875, "grad_norm": 0.0744959971370328, "learning_rate": 9.999996804113108e-07, "loss": 0.0498, "num_tokens": 183549560.0, "reward": 1.02099609375, "reward_std": 0.04208708181977272, "rewards/accuracy_reward/mean": 0.025390625, "rewards/accuracy_reward/std": 0.15746226906776428, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1405.0, "completions/mean_length": 356.03515625, "completions/mean_terminated_length": 349.4000244140625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.29427924007177353, "epoch": 0.101058381700239, "frac_reward_zero_std": 0.625, "grad_norm": 0.15154605365233112, "learning_rate": 9.999987216456977e-07, "loss": 0.0597, "num_tokens": 183803610.0, "reward": 1.1162109375, "reward_std": 0.13455097377300262, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.041276250034570694, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1719.0, "completions/mean_length": 398.662109375, "completions/mean_terminated_length": 372.482177734375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.2942073494195938, "epoch": 0.10139979515192898, "frac_reward_zero_std": 0.5, "grad_norm": 0.16534577936968609, "learning_rate": 9.999971237045224e-07, "loss": 0.1045, "num_tokens": 184085709.0, "reward": 1.087890625, "reward_std": 0.15244987607002258, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.0775880515575409, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1294.0, "completions/mean_length": 372.6796875, "completions/mean_terminated_length": 362.8055114746094, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.2918025627732277, "epoch": 0.10174120860361899, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14015878341567778, "learning_rate": 9.999948865900542e-07, "loss": 0.0884, "num_tokens": 184350089.0, "reward": 1.11572265625, "reward_std": 0.16483499109745026, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.059313252568244934, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 371.193359375, "completions/mean_terminated_length": 344.577392578125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.3130280449986458, "epoch": 0.10208262205530898, "frac_reward_zero_std": 0.40625, "grad_norm": 0.17883197417638969, "learning_rate": 9.999920103054712e-07, "loss": 0.1002, "num_tokens": 184620540.0, "reward": 1.10986328125, "reward_std": 0.20786109566688538, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98681640625, "rewards/tag_count_reward/std": 0.09614396095275879, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1760.0, "completions/mean_length": 375.564453125, "completions/mean_terminated_length": 362.3956604003906, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.30452442169189453, "epoch": 0.10242403550699898, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1295873337854741, "learning_rate": 9.999884948548586e-07, "loss": 0.0533, "num_tokens": 184893885.0, "reward": 1.0673828125, "reward_std": 0.12060447037220001, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06960996240377426, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 327.03125, "completions/mean_terminated_length": 323.66339111328125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.31711968034505844, "epoch": 0.10276544895868897, "frac_reward_zero_std": 0.53125, "grad_norm": 0.19436467987107683, "learning_rate": 9.999843402432097e-07, "loss": 0.0053, "num_tokens": 185135773.0, "reward": 1.10205078125, "reward_std": 0.18026554584503174, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.045470330864191055, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1876.0, "completions/mean_length": 355.19921875, "completions/mean_terminated_length": 345.2220153808594, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.3087913691997528, "epoch": 0.10310686241037897, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11945933021604876, "learning_rate": 9.999795464764258e-07, "loss": 0.0637, "num_tokens": 185395523.0, "reward": 1.10888671875, "reward_std": 0.11574747413396835, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 420.1953125, "completions/mean_terminated_length": 367.71368408203125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.2637704908847809, "epoch": 0.10344827586206896, "frac_reward_zero_std": 0.71875, "grad_norm": 0.129069176493238, "learning_rate": 9.99974113561316e-07, "loss": 0.0694, "num_tokens": 185695607.0, "reward": 1.06787109375, "reward_std": 0.08110800385475159, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97607421875, "rewards/tag_count_reward/std": 0.12721163034439087, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 381.2734375, "completions/mean_terminated_length": 358.1703186035156, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.29284531623125076, "epoch": 0.10378968931375897, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14809538684841123, "learning_rate": 9.999680415055969e-07, "loss": 0.0587, "num_tokens": 185966931.0, "reward": 1.06591796875, "reward_std": 0.150752991437912, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98974609375, "rewards/tag_count_reward/std": 0.0843261182308197, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1145.0, "completions/mean_length": 387.451171875, "completions/mean_terminated_length": 384.2015686035156, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2558222711086273, "epoch": 0.10413110276544896, "frac_reward_zero_std": 0.625, "grad_norm": 0.11657068719449286, "learning_rate": 9.999613303178934e-07, "loss": 0.0137, "num_tokens": 186238762.0, "reward": 1.10205078125, "reward_std": 0.13772177696228027, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1308.0, "completions/mean_length": 383.921875, "completions/mean_terminated_length": 337.1666564941406, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.32230112701654434, "epoch": 0.10447251621713896, "frac_reward_zero_std": 0.53125, "grad_norm": 0.14816735135315826, "learning_rate": 9.999539800077384e-07, "loss": 0.0468, "num_tokens": 186518754.0, "reward": 1.0927734375, "reward_std": 0.12933553755283356, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9755859375, "rewards/tag_count_reward/std": 0.12855377793312073, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 331.8203125, "completions/mean_terminated_length": 328.4618225097656, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.30123022198677063, "epoch": 0.10481392966882895, "frac_reward_zero_std": 0.78125, "grad_norm": 0.11525521402763794, "learning_rate": 9.999459905855716e-07, "loss": 0.0064, "num_tokens": 186760918.0, "reward": 1.06494140625, "reward_std": 0.07886696606874466, "rewards/accuracy_reward/mean": 0.07083333283662796, "rewards/accuracy_reward/std": 0.2568138837814331, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1424.0, "completions/mean_length": 370.048828125, "completions/mean_terminated_length": 363.4686584472656, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.30207910388708115, "epoch": 0.10515534312051895, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14716363986013806, "learning_rate": 9.999373620627412e-07, "loss": 0.0257, "num_tokens": 187030351.0, "reward": 1.15576171875, "reward_std": 0.15848655998706818, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05285605043172836, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1889.0, "completions/mean_length": 352.37890625, "completions/mean_terminated_length": 349.0606689453125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.2777985520660877, "epoch": 0.10549675657220894, "frac_reward_zero_std": 0.6875, "grad_norm": 0.13422490526648326, "learning_rate": 9.999280944515035e-07, "loss": 0.0144, "num_tokens": 187287521.0, "reward": 1.0712890625, "reward_std": 0.10727354884147644, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.041276250034570694, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1254.0, "completions/mean_length": 310.615234375, "completions/mean_terminated_length": 307.21527099609375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.30293118953704834, "epoch": 0.10583817002389895, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1472192616598218, "learning_rate": 9.99918187765022e-07, "loss": 0.0332, "num_tokens": 187518876.0, "reward": 1.09130859375, "reward_std": 0.12171714007854462, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.039800092577934265, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1823.0, "completions/mean_length": 342.078125, "completions/mean_terminated_length": 335.38824462890625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.30367131531238556, "epoch": 0.10617958347558894, "frac_reward_zero_std": 0.65625, "grad_norm": 0.14616740482061188, "learning_rate": 9.99907642017368e-07, "loss": 0.0225, "num_tokens": 187773188.0, "reward": 1.05859375, "reward_std": 0.09912148118019104, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.051725368946790695, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1500.0, "completions/mean_length": 353.77734375, "completions/mean_terminated_length": 343.791748046875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.29075247049331665, "epoch": 0.10652099692727894, "frac_reward_zero_std": 0.75, "grad_norm": 0.12292419634437332, "learning_rate": 9.998964572235205e-07, "loss": 0.0303, "num_tokens": 188028866.0, "reward": 1.0185546875, "reward_std": 0.07191826403141022, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15143637359142303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1336.0, "completions/mean_length": 391.271484375, "completions/mean_terminated_length": 384.7745361328125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.27139677852392197, "epoch": 0.10686241037896893, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1492486054783259, "learning_rate": 9.998846333993667e-07, "loss": 0.0536, "num_tokens": 188301341.0, "reward": 1.10546875, "reward_std": 0.15478020906448364, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.051725368946790695, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 353.181640625, "completions/mean_terminated_length": 349.8649597167969, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.28214485943317413, "epoch": 0.10720382383065893, "frac_reward_zero_std": 0.75, "grad_norm": 0.13193548327052018, "learning_rate": 9.99872170561701e-07, "loss": -0.002, "num_tokens": 188560970.0, "reward": 1.09033203125, "reward_std": 0.0992579534649849, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1245.0, "completions/mean_length": 422.0703125, "completions/mean_terminated_length": 359.4401550292969, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.27977199852466583, "epoch": 0.10754523728234892, "frac_reward_zero_std": 0.75, "grad_norm": 0.1022256167091299, "learning_rate": 9.99859068728225e-07, "loss": 0.0225, "num_tokens": 188852558.0, "reward": 1.06201171875, "reward_std": 0.08948306739330292, "rewards/accuracy_reward/mean": 0.0927419364452362, "rewards/accuracy_reward/std": 0.2903633117675781, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97216796875, "rewards/tag_count_reward/std": 0.14191101491451263, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1341.0, "completions/mean_length": 372.404296875, "completions/mean_terminated_length": 362.52850341796875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.26579179614782333, "epoch": 0.10788665073403893, "frac_reward_zero_std": 0.625, "grad_norm": 0.11806714416320595, "learning_rate": 9.998453279175492e-07, "loss": 0.0619, "num_tokens": 189119277.0, "reward": 1.11669921875, "reward_std": 0.11771059781312943, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1356.0, "completions/mean_length": 339.58984375, "completions/mean_terminated_length": 332.8902282714844, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3004022315144539, "epoch": 0.10822806418572892, "frac_reward_zero_std": 0.5625, "grad_norm": 0.15127815074444756, "learning_rate": 9.998309481491906e-07, "loss": 0.026, "num_tokens": 189368507.0, "reward": 1.1318359375, "reward_std": 0.17705217003822327, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1376.0, "completions/mean_length": 339.220703125, "completions/mean_terminated_length": 315.53466796875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.3088148832321167, "epoch": 0.10856947763741892, "frac_reward_zero_std": 0.5625, "grad_norm": 0.16224833151347673, "learning_rate": 9.998159294435742e-07, "loss": 0.1144, "num_tokens": 189613436.0, "reward": 1.1103515625, "reward_std": 0.1493547558784485, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310528099536896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9892578125, "rewards/tag_count_reward/std": 0.0849878340959549, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 376.20703125, "completions/mean_terminated_length": 356.3834228515625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 0.2895462065935135, "epoch": 0.10891089108910891, "frac_reward_zero_std": 0.5625, "grad_norm": 1.0045487004873368, "learning_rate": 9.998002718220323e-07, "loss": 0.1029, "num_tokens": 189876982.0, "reward": 1.0849609375, "reward_std": 0.14957909286022186, "rewards/accuracy_reward/mean": 0.09677419066429138, "rewards/accuracy_reward/std": 0.2959485352039337, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.08079168945550919, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1638.0, "completions/mean_length": 355.134765625, "completions/mean_terminated_length": 348.49609375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.29261697828769684, "epoch": 0.10925230454079891, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12460241166294483, "learning_rate": 9.997839753068054e-07, "loss": 0.042, "num_tokens": 190141227.0, "reward": 1.0849609375, "reward_std": 0.1019776314496994, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1715.0, "completions/mean_length": 381.76171875, "completions/mean_terminated_length": 348.5697326660156, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.26068638265132904, "epoch": 0.1095937179924889, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1430077392944546, "learning_rate": 9.997670399210405e-07, "loss": 0.1309, "num_tokens": 190411729.0, "reward": 1.1220703125, "reward_std": 0.16740170121192932, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9853515625, "rewards/tag_count_reward/std": 0.10388854891061783, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1269.0, "completions/mean_length": 333.16796875, "completions/mean_terminated_length": 323.0609130859375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.27038320153951645, "epoch": 0.10993513144417891, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1438909951201343, "learning_rate": 9.997494656887927e-07, "loss": 0.079, "num_tokens": 190653863.0, "reward": 1.08740234375, "reward_std": 0.1413811892271042, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1715.0, "completions/mean_length": 388.890625, "completions/mean_terminated_length": 385.6438293457031, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.27848416566848755, "epoch": 0.1102765448958689, "frac_reward_zero_std": 0.625, "grad_norm": 0.13977111602545614, "learning_rate": 9.997312526350242e-07, "loss": -0.0007, "num_tokens": 190931583.0, "reward": 1.06396484375, "reward_std": 0.11483728885650635, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.039800092577934265, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 380.09765625, "completions/mean_terminated_length": 366.9645690917969, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.2824769839644432, "epoch": 0.1106179583475589, "frac_reward_zero_std": 0.71875, "grad_norm": 0.12116287088575092, "learning_rate": 9.997124007856049e-07, "loss": 0.0861, "num_tokens": 191216641.0, "reward": 1.0234375, "reward_std": 0.0824437215924263, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 338.779296875, "completions/mean_terminated_length": 332.07647705078125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.2880857139825821, "epoch": 0.11095937179924889, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14664709057644562, "learning_rate": 9.996929101673117e-07, "loss": 0.0233, "num_tokens": 191459072.0, "reward": 1.1103515625, "reward_std": 0.16210860013961792, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1223.0, "completions/mean_length": 389.296875, "completions/mean_terminated_length": 372.9388732910156, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.26224926114082336, "epoch": 0.11130078525093888, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12906625733384455, "learning_rate": 9.996727808078292e-07, "loss": 0.0609, "num_tokens": 191732792.0, "reward": 1.080078125, "reward_std": 0.1390770822763443, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.07459938526153564, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1244.0, "completions/mean_length": 386.943359375, "completions/mean_terminated_length": 370.5621337890625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.25200947746634483, "epoch": 0.11164219870262888, "frac_reward_zero_std": 0.40625, "grad_norm": 0.15462851367834227, "learning_rate": 9.996520127357488e-07, "loss": 0.018, "num_tokens": 192005307.0, "reward": 1.14892578125, "reward_std": 0.2434626817703247, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 379.125, "completions/mean_terminated_length": 369.288818359375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.266694150865078, "epoch": 0.11198361215431887, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11406265129366824, "learning_rate": 9.996306059805693e-07, "loss": 0.029, "num_tokens": 192274587.0, "reward": 1.09130859375, "reward_std": 0.10018262267112732, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1658.0, "completions/mean_length": 372.1171875, "completions/mean_terminated_length": 368.83758544921875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.25603753328323364, "epoch": 0.11232502560600888, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12759403353261287, "learning_rate": 9.99608560572697e-07, "loss": 0.028, "num_tokens": 192537335.0, "reward": 1.18310546875, "reward_std": 0.1331692785024643, "rewards/accuracy_reward/mean": 0.185546875, "rewards/accuracy_reward/std": 0.38912075757980347, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.039800092577934265, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 398.462890625, "completions/mean_terminated_length": 375.5980224609375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.27717695385217667, "epoch": 0.11266643905769887, "frac_reward_zero_std": 0.625, "grad_norm": 0.1280242733622484, "learning_rate": 9.995858765434448e-07, "loss": 0.067, "num_tokens": 192818244.0, "reward": 1.05126953125, "reward_std": 0.11725736409425735, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98876953125, "rewards/tag_count_reward/std": 0.08845183253288269, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 360.59765625, "completions/mean_terminated_length": 347.31103515625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.3031795024871826, "epoch": 0.11300785250938887, "frac_reward_zero_std": 0.59375, "grad_norm": 0.15391015204311242, "learning_rate": 9.995625539250332e-07, "loss": 0.0774, "num_tokens": 193085510.0, "reward": 1.0771484375, "reward_std": 0.1299150586128235, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.0678301453590393, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 370.060546875, "completions/mean_terminated_length": 343.4266052246094, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.28816431015729904, "epoch": 0.11334926596107886, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13965030940795276, "learning_rate": 9.995385927505893e-07, "loss": 0.1237, "num_tokens": 193354325.0, "reward": 1.0703125, "reward_std": 0.13359710574150085, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.09310565888881683, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1819.0, "completions/mean_length": 342.81640625, "completions/mean_terminated_length": 329.3897705078125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.2908841222524643, "epoch": 0.11369067941276886, "frac_reward_zero_std": 0.625, "grad_norm": 0.15813469270355787, "learning_rate": 9.995139930541476e-07, "loss": 0.0676, "num_tokens": 193607911.0, "reward": 1.09619140625, "reward_std": 0.14428135752677917, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1864.0, "completions/mean_length": 381.59375, "completions/mean_terminated_length": 375.058837890625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.2790194973349571, "epoch": 0.11403209286445885, "frac_reward_zero_std": 0.53125, "grad_norm": 0.15608754236169278, "learning_rate": 9.994887548706493e-07, "loss": 0.0299, "num_tokens": 193880519.0, "reward": 1.1171875, "reward_std": 0.1585882157087326, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.051725368946790695, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 387.12890625, "completions/mean_terminated_length": 367.4347839355469, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.269361674785614, "epoch": 0.11437350631614886, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13133849527457755, "learning_rate": 9.994628782359422e-07, "loss": 0.0876, "num_tokens": 194155737.0, "reward": 1.16455078125, "reward_std": 0.1564169079065323, "rewards/accuracy_reward/mean": 0.173828125, "rewards/accuracy_reward/std": 0.3793322443962097, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99072265625, "rewards/tag_count_reward/std": 0.0814909115433693, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1834.0, "completions/mean_length": 399.40625, "completions/mean_terminated_length": 389.6896057128906, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.25166933238506317, "epoch": 0.11471491976783885, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1453250735537013, "learning_rate": 9.99436363186782e-07, "loss": 0.0431, "num_tokens": 194434121.0, "reward": 1.18310546875, "reward_std": 0.18678691983222961, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39069411158561707, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1641.0, "completions/mean_length": 417.912109375, "completions/mean_terminated_length": 382.12176513671875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.24663875624537468, "epoch": 0.11505633321952885, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1465670234550623, "learning_rate": 9.994092097608302e-07, "loss": 0.0716, "num_tokens": 194724812.0, "reward": 1.10693359375, "reward_std": 0.1897919476032257, "rewards/accuracy_reward/mean": 0.1270161271095276, "rewards/accuracy_reward/std": 0.3333272337913513, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98388671875, "rewards/tag_count_reward/std": 0.10885065048933029, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 459.80859375, "completions/mean_terminated_length": 411.9034118652344, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.2443750835955143, "epoch": 0.11539774667121884, "frac_reward_zero_std": 0.6875, "grad_norm": 0.17050862631768396, "learning_rate": 9.993814179966551e-07, "loss": 0.0307, "num_tokens": 195044122.0, "reward": 1.04931640625, "reward_std": 0.10397680103778839, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97509765625, "rewards/tag_count_reward/std": 0.1317501813173294, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1428.0, "completions/mean_length": 395.091796875, "completions/mean_terminated_length": 382.0767822265625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.27678485214710236, "epoch": 0.11573916012290884, "frac_reward_zero_std": 0.5, "grad_norm": 0.1431995322386068, "learning_rate": 9.993529879337324e-07, "loss": 0.056, "num_tokens": 195327945.0, "reward": 1.1279296875, "reward_std": 0.19397452473640442, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06960996240377426, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1748.0, "completions/mean_length": 358.12890625, "completions/mean_terminated_length": 351.5019836425781, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.3024366945028305, "epoch": 0.11608057357459883, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1347420372554622, "learning_rate": 9.993239196124437e-07, "loss": -0.001, "num_tokens": 195583243.0, "reward": 1.0966796875, "reward_std": 0.11830782890319824, "rewards/accuracy_reward/mean": 0.10282257944345474, "rewards/accuracy_reward/std": 0.30403366684913635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1892.0, "completions/mean_length": 448.568359375, "completions/mean_terminated_length": 397.0, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.27093032002449036, "epoch": 0.11642198702628884, "frac_reward_zero_std": 0.5, "grad_norm": 0.1379165173467897, "learning_rate": 9.992942130740775e-07, "loss": 0.0709, "num_tokens": 195890862.0, "reward": 1.15576171875, "reward_std": 0.18987607955932617, "rewards/accuracy_reward/mean": 0.181640625, "rewards/accuracy_reward/std": 0.38592514395713806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97412109375, "rewards/tag_count_reward/std": 0.13340787589550018, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 453.7421875, "completions/mean_terminated_length": 378.7852783203125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.2699429765343666, "epoch": 0.11676340047797883, "frac_reward_zero_std": 0.46875, "grad_norm": 0.15615529700802028, "learning_rate": 9.99263868360829e-07, "loss": 0.1111, "num_tokens": 196196858.0, "reward": 1.078125, "reward_std": 0.19100454449653625, "rewards/accuracy_reward/mean": 0.11693548411130905, "rewards/accuracy_reward/std": 0.3216678202152252, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1563539206981659, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1554.0, "completions/mean_length": 340.376953125, "completions/mean_terminated_length": 337.03521728515625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.31898388266563416, "epoch": 0.11710481392966883, "frac_reward_zero_std": 0.75, "grad_norm": 0.12508337759432647, "learning_rate": 9.992328855157995e-07, "loss": 0.0264, "num_tokens": 196439867.0, "reward": 1.09228515625, "reward_std": 0.09113167226314545, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1942.0, "completions/mean_length": 426.056640625, "completions/mean_terminated_length": 413.2854309082031, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.27037739753723145, "epoch": 0.11744622738135882, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11536277562928242, "learning_rate": 9.992012645829967e-07, "loss": 0.083, "num_tokens": 196734072.0, "reward": 1.068359375, "reward_std": 0.1336260735988617, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.07294141501188278, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 366.18359375, "completions/mean_terminated_length": 362.8923645019531, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.30431725084781647, "epoch": 0.11778764083304882, "frac_reward_zero_std": 0.59375, "grad_norm": 0.15256673084704292, "learning_rate": 9.991690056073353e-07, "loss": 0.0243, "num_tokens": 197000710.0, "reward": 1.14892578125, "reward_std": 0.15473595261573792, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1642.0, "completions/mean_length": 447.283203125, "completions/mean_terminated_length": 421.8750305175781, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2714405357837677, "epoch": 0.11812905428473881, "frac_reward_zero_std": 0.46875, "grad_norm": 0.15303284935413483, "learning_rate": 9.991361086346352e-07, "loss": 0.124, "num_tokens": 197312935.0, "reward": 1.11572265625, "reward_std": 0.1496298462152481, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98681640625, "rewards/tag_count_reward/std": 0.09224860370159149, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 354.259765625, "completions/mean_terminated_length": 334.1759033203125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.30228959769010544, "epoch": 0.11847046773642882, "frac_reward_zero_std": 0.65625, "grad_norm": 0.14367073389098045, "learning_rate": 9.991025737116235e-07, "loss": 0.078, "num_tokens": 197568364.0, "reward": 1.15625, "reward_std": 0.12649184465408325, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.08365631848573685, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1702.0, "completions/mean_length": 390.53125, "completions/mean_terminated_length": 367.55645751953125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2930081710219383, "epoch": 0.1188118811881188, "frac_reward_zero_std": 0.375, "grad_norm": 0.18534420805877388, "learning_rate": 9.990684008859325e-07, "loss": 0.1186, "num_tokens": 197845756.0, "reward": 1.1884765625, "reward_std": 0.24251723289489746, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.39980348944664, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9892578125, "rewards/tag_count_reward/std": 0.08781895041465759, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 388.140625, "completions/mean_terminated_length": 378.3575744628906, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.2796627804636955, "epoch": 0.11915329463980881, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13041897334109895, "learning_rate": 9.990335902061015e-07, "loss": 0.0257, "num_tokens": 198121700.0, "reward": 1.10009765625, "reward_std": 0.14379920065402985, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.059313252568244934, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1820.0, "completions/mean_length": 397.697265625, "completions/mean_terminated_length": 374.8218078613281, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.293277382850647, "epoch": 0.1194947080914988, "frac_reward_zero_std": 0.46875, "grad_norm": 0.16227701941956757, "learning_rate": 9.989981417215755e-07, "loss": 0.1153, "num_tokens": 198396137.0, "reward": 1.03173828125, "reward_std": 0.1477942168712616, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98876953125, "rewards/tag_count_reward/std": 0.08845183253288269, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 429.5703125, "completions/mean_terminated_length": 413.6094665527344, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.24435262382030487, "epoch": 0.1198361215431888, "frac_reward_zero_std": 0.5, "grad_norm": 0.14534170209667785, "learning_rate": 9.98962055482705e-07, "loss": 0.0526, "num_tokens": 198695293.0, "reward": 1.150390625, "reward_std": 0.20175957679748535, "rewards/accuracy_reward/mean": 0.16733871400356293, "rewards/accuracy_reward/std": 0.37365487217903137, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.08485843241214752, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 379.4453125, "completions/mean_terminated_length": 366.3070983886719, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.28834106773138046, "epoch": 0.12017753499487879, "frac_reward_zero_std": 0.625, "grad_norm": 0.1538990361663189, "learning_rate": 9.989253315407466e-07, "loss": 0.0711, "num_tokens": 198956689.0, "reward": 1.046875, "reward_std": 0.12485307455062866, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 379.3359375, "completions/mean_terminated_length": 366.19683837890625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.29046205431222916, "epoch": 0.1205189484465688, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14753488016123248, "learning_rate": 9.98887969947863e-07, "loss": 0.0515, "num_tokens": 199233757.0, "reward": 1.0703125, "reward_std": 0.13476485013961792, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1786.0, "completions/mean_length": 463.990234375, "completions/mean_terminated_length": 442.03369140625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2646271660923958, "epoch": 0.12086036189825879, "frac_reward_zero_std": 0.625, "grad_norm": 0.11362311997274527, "learning_rate": 9.988499707571226e-07, "loss": 0.0739, "num_tokens": 199552824.0, "reward": 1.08154296875, "reward_std": 0.1249900758266449, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98974609375, "rewards/tag_count_reward/std": 0.08286299556493759, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1830.0, "completions/mean_length": 428.650390625, "completions/mean_terminated_length": 399.6759338378906, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.26122232526540756, "epoch": 0.12120177534994879, "frac_reward_zero_std": 0.5, "grad_norm": 0.14045832850703877, "learning_rate": 9.988113340224986e-07, "loss": 0.0798, "num_tokens": 199847157.0, "reward": 1.14208984375, "reward_std": 0.1813819408416748, "rewards/accuracy_reward/mean": 0.16129031777381897, "rewards/accuracy_reward/std": 0.3681698739528656, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98583984375, "rewards/tag_count_reward/std": 0.10097216814756393, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 414.91796875, "completions/mean_terminated_length": 408.5137634277344, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.28861329704523087, "epoch": 0.12154318880163878, "frac_reward_zero_std": 0.625, "grad_norm": 0.1373200162426363, "learning_rate": 9.98772059798871e-07, "loss": 0.0198, "num_tokens": 200140187.0, "reward": 1.08642578125, "reward_std": 0.12792286276817322, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1488.0, "completions/mean_length": 440.341796875, "completions/mean_terminated_length": 430.8664245605469, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.26723769307136536, "epoch": 0.12188460225332878, "frac_reward_zero_std": 0.40625, "grad_norm": 0.15719514841398596, "learning_rate": 9.987321481420244e-07, "loss": 0.079, "num_tokens": 200441706.0, "reward": 1.14306640625, "reward_std": 0.19326218962669373, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.059313252568244934, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1368.0, "completions/mean_length": 393.708984375, "completions/mean_terminated_length": 380.6830749511719, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.28503791242837906, "epoch": 0.12222601570501877, "frac_reward_zero_std": 0.625, "grad_norm": 0.12368268821905706, "learning_rate": 9.98691599108649e-07, "loss": 0.0751, "num_tokens": 200734053.0, "reward": 1.15771484375, "reward_std": 0.12457266449928284, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.06697065383195877, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 424.1171875, "completions/mean_terminated_length": 401.6079406738281, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.28593524545431137, "epoch": 0.12256742915670878, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13029650254117173, "learning_rate": 9.986504127563407e-07, "loss": 0.1402, "num_tokens": 201033073.0, "reward": 1.0703125, "reward_std": 0.11578138172626495, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.986328125, "rewards/tag_count_reward/std": 0.09920720010995865, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1883.0, "completions/mean_length": 448.62109375, "completions/mean_terminated_length": 416.7609558105469, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.25212161242961884, "epoch": 0.12290884260839877, "frac_reward_zero_std": 0.625, "grad_norm": 0.10673708808178359, "learning_rate": 9.986085891436e-07, "loss": 0.1802, "num_tokens": 201345631.0, "reward": 1.01611328125, "reward_std": 0.09916087985038757, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98486328125, "rewards/tag_count_reward/std": 0.10440578311681747, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1565.0, "completions/mean_length": 397.025390625, "completions/mean_terminated_length": 387.2947082519531, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.28900285065174103, "epoch": 0.12325025606008877, "frac_reward_zero_std": 0.46875, "grad_norm": 0.14753285683527897, "learning_rate": 9.985661283298332e-07, "loss": 0.0446, "num_tokens": 201630732.0, "reward": 1.1494140625, "reward_std": 0.21098366379737854, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1482.0, "completions/mean_length": 396.28125, "completions/mean_terminated_length": 386.5461730957031, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.2883039340376854, "epoch": 0.12359166951177876, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12909361231956532, "learning_rate": 9.985230303753514e-07, "loss": 0.0166, "num_tokens": 201915980.0, "reward": 1.07763671875, "reward_std": 0.12312937527894974, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 417.099609375, "completions/mean_terminated_length": 410.7039489746094, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.30701597034931183, "epoch": 0.12393308296346876, "frac_reward_zero_std": 0.5625, "grad_norm": 0.15710285048246006, "learning_rate": 9.984792953413704e-07, "loss": 0.0253, "num_tokens": 202211935.0, "reward": 1.1220703125, "reward_std": 0.15250445902347565, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9736328125, "rewards/tag_count_reward/std": 0.13826605677604675, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1511.0, "completions/mean_length": 430.203125, "completions/mean_terminated_length": 417.4645690917969, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.2738180235028267, "epoch": 0.12427449641515875, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12071786200952667, "learning_rate": 9.984349232900116e-07, "loss": 0.0553, "num_tokens": 202499687.0, "reward": 1.078125, "reward_std": 0.11202438175678253, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2018.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 389.1640625, "completions/mean_terminated_length": 389.1640625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.27628692984580994, "epoch": 0.12461590986684876, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13983206596666095, "learning_rate": 9.983899142843003e-07, "loss": -0.0087, "num_tokens": 202774731.0, "reward": 1.142578125, "reward_std": 0.16886216402053833, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 394.0546875, "completions/mean_terminated_length": 384.3064880371094, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2971241921186447, "epoch": 0.12495732331853875, "frac_reward_zero_std": 0.53125, "grad_norm": 0.14163120661220682, "learning_rate": 9.983442683881674e-07, "loss": 0.0453, "num_tokens": 203053287.0, "reward": 1.10888671875, "reward_std": 0.17165344953536987, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1750.0, "completions/mean_length": 407.48828125, "completions/mean_terminated_length": 391.3096618652344, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.2717456519603729, "epoch": 0.12529873677022874, "frac_reward_zero_std": 0.5625, "grad_norm": 0.15953656931454244, "learning_rate": 9.98297985666448e-07, "loss": 0.0768, "num_tokens": 203339281.0, "reward": 1.11181640625, "reward_std": 0.1669941544532776, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1193.0, "completions/mean_length": 400.037109375, "completions/mean_terminated_length": 387.06103515625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2613164149224758, "epoch": 0.12564015022191874, "frac_reward_zero_std": 0.53125, "grad_norm": 0.14462807352850793, "learning_rate": 9.982510661848819e-07, "loss": 0.0635, "num_tokens": 203616420.0, "reward": 1.16015625, "reward_std": 0.20131567120552063, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 424.072265625, "completions/mean_terminated_length": 398.295654296875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.257813036441803, "epoch": 0.12598156367360874, "frac_reward_zero_std": 0.40625, "grad_norm": 0.15777318808536156, "learning_rate": 9.98203510010113e-07, "loss": 0.0709, "num_tokens": 203906089.0, "reward": 1.134765625, "reward_std": 0.22579026222229004, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.09310565888881683, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1761.0, "completions/mean_length": 447.19921875, "completions/mean_terminated_length": 431.4122314453125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.22951560467481613, "epoch": 0.12632297712529875, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1212377876793294, "learning_rate": 9.981553172096898e-07, "loss": 0.0418, "num_tokens": 204211007.0, "reward": 1.14892578125, "reward_std": 0.19588682055473328, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1325.0, "completions/mean_length": 394.9375, "completions/mean_terminated_length": 385.19451904296875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.28569696843624115, "epoch": 0.12666439057698872, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1381300538730963, "learning_rate": 9.981064878520655e-07, "loss": 0.0279, "num_tokens": 204489871.0, "reward": 1.08544921875, "reward_std": 0.14951488375663757, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1566.0, "completions/mean_length": 402.966796875, "completions/mean_terminated_length": 383.4604797363281, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.2946750223636627, "epoch": 0.12700580402867873, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13246193164729103, "learning_rate": 9.980570220065969e-07, "loss": 0.0682, "num_tokens": 204787214.0, "reward": 1.0361328125, "reward_std": 0.11059455573558807, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.08079168945550919, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1621.0, "completions/mean_length": 405.4375, "completions/mean_terminated_length": 395.75640869140625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.26182946190238, "epoch": 0.12734721748036873, "frac_reward_zero_std": 0.40625, "grad_norm": 0.14857915434920055, "learning_rate": 9.980069197435444e-07, "loss": 0.0289, "num_tokens": 205069422.0, "reward": 1.1787109375, "reward_std": 0.2348119616508484, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.3875311613082886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1631.0, "completions/mean_length": 416.74609375, "completions/mean_terminated_length": 407.13165283203125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.25877176970243454, "epoch": 0.12768863093205873, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1658700300273315, "learning_rate": 9.979561811340737e-07, "loss": 0.0283, "num_tokens": 205357004.0, "reward": 1.14013671875, "reward_std": 0.1742173135280609, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1705.0, "completions/mean_length": 367.4609375, "completions/mean_terminated_length": 360.87060546875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.2931177392601967, "epoch": 0.1280300443837487, "frac_reward_zero_std": 0.40625, "grad_norm": 0.18436217308762842, "learning_rate": 9.979048062502532e-07, "loss": 0.0181, "num_tokens": 205627384.0, "reward": 1.10693359375, "reward_std": 0.1810162365436554, "rewards/accuracy_reward/mean": 0.11693548411130905, "rewards/accuracy_reward/std": 0.3216678202152252, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.05921651050448418, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 452.7109375, "completions/mean_terminated_length": 430.5980224609375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.23108897730708122, "epoch": 0.1283714578354387, "frac_reward_zero_std": 0.46875, "grad_norm": 0.13043105489677048, "learning_rate": 9.978527951650558e-07, "loss": 0.1226, "num_tokens": 205939652.0, "reward": 1.080078125, "reward_std": 0.16918101906776428, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.08907753974199295, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1749.0, "completions/mean_length": 389.62109375, "completions/mean_terminated_length": 386.375732421875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.24588216468691826, "epoch": 0.12871287128712872, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12394162936355425, "learning_rate": 9.978001479523573e-07, "loss": 0.0063, "num_tokens": 206206786.0, "reward": 1.08203125, "reward_std": 0.10409127175807953, "rewards/accuracy_reward/mean": 0.08669354766607285, "rewards/accuracy_reward/std": 0.281669557094574, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1839.0, "completions/mean_length": 388.759765625, "completions/mean_terminated_length": 378.9803771972656, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.2749894857406616, "epoch": 0.12905428473881872, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11504938202488121, "learning_rate": 9.97746864686938e-07, "loss": 0.0551, "num_tokens": 206475303.0, "reward": 1.06787109375, "reward_std": 0.1098242998123169, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1269.0, "completions/mean_length": 397.94140625, "completions/mean_terminated_length": 388.21612548828125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.2553471699357033, "epoch": 0.1293956981905087, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11683823154541141, "learning_rate": 9.976929454444809e-07, "loss": 0.0677, "num_tokens": 206753945.0, "reward": 1.103515625, "reward_std": 0.13129165768623352, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.062285590916872025, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1605.0, "completions/mean_length": 419.189453125, "completions/mean_terminated_length": 409.58941650390625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.2531426101922989, "epoch": 0.1297371116421987, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12226887152296481, "learning_rate": 9.976383903015724e-07, "loss": 0.0667, "num_tokens": 207044154.0, "reward": 1.1357421875, "reward_std": 0.15744519233703613, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 442.78125, "completions/mean_terminated_length": 407.53692626953125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2526315227150917, "epoch": 0.1300785250938887, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11109770583750916, "learning_rate": 9.975831993357026e-07, "loss": 0.1522, "num_tokens": 207348602.0, "reward": 1.09033203125, "reward_std": 0.10148291289806366, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98291015625, "rewards/tag_count_reward/std": 0.11092886328697205, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1570.0, "completions/mean_length": 435.826171875, "completions/mean_terminated_length": 426.3241882324219, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.23951656743884087, "epoch": 0.1304199385455787, "frac_reward_zero_std": 0.625, "grad_norm": 0.11020996352229771, "learning_rate": 9.975273726252644e-07, "loss": 0.0606, "num_tokens": 207645521.0, "reward": 1.07275390625, "reward_std": 0.12763206660747528, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 456.447265625, "completions/mean_terminated_length": 440.7514953613281, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2395961470901966, "epoch": 0.13076135199726868, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1533124751434918, "learning_rate": 9.974709102495536e-07, "loss": 0.0351, "num_tokens": 207958198.0, "reward": 1.10107421875, "reward_std": 0.18313108384609222, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97021484375, "rewards/tag_count_reward/std": 0.1432313472032547, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 414.37109375, "completions/mean_terminated_length": 395.0000305175781, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.2551584467291832, "epoch": 0.13110276544895869, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13697800951204817, "learning_rate": 9.974138122887689e-07, "loss": 0.0919, "num_tokens": 208242324.0, "reward": 1.1064453125, "reward_std": 0.15431207418441772, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.08079168945550919, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2047.0, "completions/max_terminated_length": 1911.0, "completions/mean_length": 423.4609375, "completions/mean_terminated_length": 377.81927490234375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.28544455021619797, "epoch": 0.1314441789006487, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14224860192778793, "learning_rate": 9.973560788240122e-07, "loss": 0.0175, "num_tokens": 208543664.0, "reward": 1.0693359375, "reward_std": 0.13196125626564026, "rewards/accuracy_reward/mean": 0.0947580635547638, "rewards/accuracy_reward/std": 0.29317617416381836, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9775390625, "rewards/tag_count_reward/std": 0.1260315477848053, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1741.0, "completions/mean_length": 385.763671875, "completions/mean_terminated_length": 382.5107727050781, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.27001994103193283, "epoch": 0.1317855923523387, "frac_reward_zero_std": 0.625, "grad_norm": 0.12191994392411408, "learning_rate": 9.972977099372877e-07, "loss": 0.0086, "num_tokens": 208811863.0, "reward": 1.064453125, "reward_std": 0.1194448471069336, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1633.0, "completions/mean_length": 398.1875, "completions/mean_terminated_length": 385.19683837890625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2921775206923485, "epoch": 0.13212700580402867, "frac_reward_zero_std": 0.59375, "grad_norm": 0.15644456212500396, "learning_rate": 9.972387057115022e-07, "loss": 0.0807, "num_tokens": 209097655.0, "reward": 1.02587890625, "reward_std": 0.10711659491062164, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.0704338550567627, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 421.107421875, "completions/mean_terminated_length": 405.06707763671875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.259803868830204, "epoch": 0.13246841925571867, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13638590549618956, "learning_rate": 9.97179066230465e-07, "loss": 0.044, "num_tokens": 209401966.0, "reward": 1.08349609375, "reward_std": 0.1178620457649231, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99169921875, "rewards/tag_count_reward/std": 0.07696826010942459, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 375.88671875, "completions/mean_terminated_length": 372.6144714355469, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.26465272903442383, "epoch": 0.13280983270740868, "frac_reward_zero_std": 0.78125, "grad_norm": 0.10286597326682936, "learning_rate": 9.971187915788875e-07, "loss": 0.0335, "num_tokens": 209674516.0, "reward": 1.091796875, "reward_std": 0.07326269149780273, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.051725368946790695, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1434.0, "completions/mean_length": 402.671875, "completions/mean_terminated_length": 392.9744873046875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.24010789394378662, "epoch": 0.13315124615909868, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1265205629001631, "learning_rate": 9.970578818423837e-07, "loss": 0.0727, "num_tokens": 209957868.0, "reward": 1.12451171875, "reward_std": 0.1546522080898285, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 376.24609375, "completions/mean_terminated_length": 359.7593688964844, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.26325681060552597, "epoch": 0.13349265961078866, "frac_reward_zero_std": 0.5, "grad_norm": 0.17174841197025734, "learning_rate": 9.969963371074693e-07, "loss": 0.0834, "num_tokens": 210229402.0, "reward": 1.09423828125, "reward_std": 0.16704586148262024, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1597.0, "completions/mean_length": 390.142578125, "completions/mean_terminated_length": 377.0885925292969, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2688804008066654, "epoch": 0.13383407306247866, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13403703834555217, "learning_rate": 9.96934157461562e-07, "loss": 0.0841, "num_tokens": 210502531.0, "reward": 1.0732421875, "reward_std": 0.13266593217849731, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06960996240377426, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1157.0, "completions/mean_length": 379.56640625, "completions/mean_terminated_length": 353.0833435058594, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.2548094019293785, "epoch": 0.13417548651416866, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13539912729442807, "learning_rate": 9.968713429929818e-07, "loss": 0.0813, "num_tokens": 210772293.0, "reward": 1.14599609375, "reward_std": 0.1686253696680069, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98779296875, "rewards/tag_count_reward/std": 0.09369774907827377, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1588.0, "completions/mean_length": 381.578125, "completions/mean_terminated_length": 371.75640869140625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2595689781010151, "epoch": 0.13451689996585867, "frac_reward_zero_std": 0.75, "grad_norm": 0.0977177298483914, "learning_rate": 9.968078937909493e-07, "loss": 0.0729, "num_tokens": 211037389.0, "reward": 1.0322265625, "reward_std": 0.07294808328151703, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1867.0, "completions/mean_length": 383.03515625, "completions/mean_terminated_length": 373.2220153808594, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.26100823283195496, "epoch": 0.13485831341754864, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1794070879109447, "learning_rate": 9.967438099455881e-07, "loss": 0.0291, "num_tokens": 211303071.0, "reward": 1.21337890625, "reward_std": 0.18684610724449158, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41380295157432556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.06134068965911865, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 409.587890625, "completions/mean_terminated_length": 403.16278076171875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.2567984461784363, "epoch": 0.13519972686923865, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09408836539481523, "learning_rate": 9.96679091547922e-07, "loss": 0.0056, "num_tokens": 211593164.0, "reward": 1.1337890625, "reward_std": 0.1235247477889061, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1960.0, "completions/mean_length": 412.34375, "completions/mean_terminated_length": 399.4645690917969, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.26471738517284393, "epoch": 0.13554114032092865, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1607279536013307, "learning_rate": 9.96613738689877e-07, "loss": 0.0869, "num_tokens": 211875388.0, "reward": 1.12158203125, "reward_std": 0.1784282624721527, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1206.0, "completions/mean_length": 360.025390625, "completions/mean_terminated_length": 350.0766296386719, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.28096191585063934, "epoch": 0.13588255377261865, "frac_reward_zero_std": 0.53125, "grad_norm": 0.14161513733301276, "learning_rate": 9.965477514642797e-07, "loss": 0.0484, "num_tokens": 212133497.0, "reward": 1.12841796875, "reward_std": 0.1555912047624588, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1745.0, "completions/mean_length": 446.662109375, "completions/mean_terminated_length": 437.2239990234375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.22882525250315666, "epoch": 0.13622396722430863, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1449006492106089, "learning_rate": 9.964811299648581e-07, "loss": 0.005, "num_tokens": 212444204.0, "reward": 1.1181640625, "reward_std": 0.20377370715141296, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1884.0, "completions/mean_length": 401.951171875, "completions/mean_terminated_length": 388.9901428222656, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.2544676698744297, "epoch": 0.13656538067599863, "frac_reward_zero_std": 0.59375, "grad_norm": 0.16205598627873904, "learning_rate": 9.964138742862408e-07, "loss": 0.0938, "num_tokens": 212734259.0, "reward": 1.10595703125, "reward_std": 0.1586868166923523, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1931.0, "completions/mean_length": 441.634765625, "completions/mean_terminated_length": 393.17303466796875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.26745621860027313, "epoch": 0.13690679412768864, "frac_reward_zero_std": 0.5, "grad_norm": 0.15269953210459533, "learning_rate": 9.963459845239579e-07, "loss": 0.0847, "num_tokens": 213044424.0, "reward": 1.13232421875, "reward_std": 0.15922270715236664, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97021484375, "rewards/tag_count_reward/std": 0.1397739052772522, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1411.0, "completions/mean_length": 428.302734375, "completions/mean_terminated_length": 421.9510192871094, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.21955596283078194, "epoch": 0.13724820757937864, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13301925224922548, "learning_rate": 9.962774607744387e-07, "loss": 0.0292, "num_tokens": 213328499.0, "reward": 1.076171875, "reward_std": 0.1476464569568634, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.051725368946790695, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1968.0, "completions/mean_length": 428.431640625, "completions/mean_terminated_length": 399.4671936035156, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.2692967876791954, "epoch": 0.13758962103106862, "frac_reward_zero_std": 0.4375, "grad_norm": 0.15217346164364137, "learning_rate": 9.962083031350148e-07, "loss": 0.0341, "num_tokens": 213632064.0, "reward": 1.1396484375, "reward_std": 0.2021113485097885, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9775390625, "rewards/tag_count_reward/std": 0.11697173118591309, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1495.0, "completions/mean_length": 452.478515625, "completions/mean_terminated_length": 439.91534423828125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.21951111406087875, "epoch": 0.13793103448275862, "frac_reward_zero_std": 0.5, "grad_norm": 0.12554547538762578, "learning_rate": 9.961385117039167e-07, "loss": 0.0697, "num_tokens": 213940037.0, "reward": 1.1357421875, "reward_std": 0.16819798946380615, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06960996240377426, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1837.0, "completions/mean_length": 430.84765625, "completions/mean_terminated_length": 424.50592041015625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.24757782369852066, "epoch": 0.13827244793444862, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11036857279190077, "learning_rate": 9.960680865802762e-07, "loss": 0.0268, "num_tokens": 214237095.0, "reward": 1.09814453125, "reward_std": 0.1412362903356552, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1807.0, "completions/mean_length": 413.515625, "completions/mean_terminated_length": 400.6456604003906, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.2470545694231987, "epoch": 0.13861386138613863, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13067882954541574, "learning_rate": 9.959970278641246e-07, "loss": 0.0469, "num_tokens": 214522735.0, "reward": 1.1396484375, "reward_std": 0.13197806477546692, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.0678301453590393, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1873.0, "completions/mean_length": 470.48046875, "completions/mean_terminated_length": 372.35272216796875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.2804912030696869, "epoch": 0.1389552748378286, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14500340364969097, "learning_rate": 9.959253356563931e-07, "loss": 0.0357, "num_tokens": 214850549.0, "reward": 1.04541015625, "reward_std": 0.12958434224128723, "rewards/accuracy_reward/mean": 0.0947580635547638, "rewards/accuracy_reward/std": 0.29317617416381836, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95361328125, "rewards/tag_count_reward/std": 0.17323793470859528, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 483.587890625, "completions/mean_terminated_length": 468.1597595214844, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.2514961436390877, "epoch": 0.1392966882895186, "frac_reward_zero_std": 0.625, "grad_norm": 0.10351050357701702, "learning_rate": 9.958530100589131e-07, "loss": 0.0965, "num_tokens": 215185122.0, "reward": 1.0283203125, "reward_std": 0.09347065538167953, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9697265625, "rewards/tag_count_reward/std": 0.14692385494709015, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1531.0, "completions/max_terminated_length": 1531.0, "completions/mean_length": 378.921875, "completions/mean_terminated_length": 378.921875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.2687805965542793, "epoch": 0.1396381017412086, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1308137895582865, "learning_rate": 9.957800511744153e-07, "loss": -0.0064, "num_tokens": 215455898.0, "reward": 1.0546875, "reward_std": 0.0994751825928688, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1723.0, "completions/mean_length": 428.048828125, "completions/mean_terminated_length": 395.7789001464844, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.2632024884223938, "epoch": 0.1399795151928986, "frac_reward_zero_std": 0.625, "grad_norm": 0.14747905166559663, "learning_rate": 9.957064591065301e-07, "loss": 0.1604, "num_tokens": 215765843.0, "reward": 1.029296875, "reward_std": 0.10419270396232605, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.0862877294421196, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1415.0, "completions/mean_length": 413.357421875, "completions/mean_terminated_length": 403.7229919433594, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.2519109696149826, "epoch": 0.1403209286445886, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1424263027560086, "learning_rate": 9.956322339597874e-07, "loss": 0.025, "num_tokens": 216054362.0, "reward": 1.18115234375, "reward_std": 0.16768009960651398, "rewards/accuracy_reward/mean": 0.185546875, "rewards/accuracy_reward/std": 0.38912075757980347, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1792.0, "completions/mean_length": 413.99609375, "completions/mean_terminated_length": 394.6205749511719, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.26123039424419403, "epoch": 0.1406623420962786, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12313850676218813, "learning_rate": 9.955573758396162e-07, "loss": 0.0865, "num_tokens": 216343560.0, "reward": 1.11181640625, "reward_std": 0.10014599561691284, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310528099536896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99072265625, "rewards/tag_count_reward/std": 0.0814909115433693, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 379.5703125, "completions/mean_terminated_length": 366.4330749511719, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.2694252207875252, "epoch": 0.1410037555479686, "frac_reward_zero_std": 0.625, "grad_norm": 0.10790004357198897, "learning_rate": 9.954818848523442e-07, "loss": 0.0331, "num_tokens": 216606748.0, "reward": 1.14013671875, "reward_std": 0.1284443736076355, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.06697065383195877, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 381.62109375, "completions/mean_terminated_length": 375.0863037109375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.2635764479637146, "epoch": 0.1413451689996586, "frac_reward_zero_std": 0.625, "grad_norm": 0.11588811930026578, "learning_rate": 9.954057611051986e-07, "loss": 0.0373, "num_tokens": 216889658.0, "reward": 1.1044921875, "reward_std": 0.1498107612133026, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 411.48046875, "completions/mean_terminated_length": 385.5039978027344, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2635978162288666, "epoch": 0.14168658245134857, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1495163933573305, "learning_rate": 9.953290047063054e-07, "loss": 0.1111, "num_tokens": 217174336.0, "reward": 1.0771484375, "reward_std": 0.16440929472446442, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9873046875, "rewards/tag_count_reward/std": 0.09165237098932266, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 404.357421875, "completions/mean_terminated_length": 384.86761474609375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.26211829483509064, "epoch": 0.14202799590303858, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12694433357071258, "learning_rate": 9.952516157646884e-07, "loss": 0.0195, "num_tokens": 217459095.0, "reward": 1.1611328125, "reward_std": 0.15174177289009094, "rewards/accuracy_reward/mean": 0.169921875, "rewards/accuracy_reward/std": 0.3759314715862274, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.08079168945550919, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 461.99609375, "completions/mean_terminated_length": 440.01190185546875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.2296997681260109, "epoch": 0.14236940935472858, "frac_reward_zero_std": 0.34375, "grad_norm": 0.15827239887682334, "learning_rate": 9.951735943902704e-07, "loss": 0.1139, "num_tokens": 217767461.0, "reward": 1.19873046875, "reward_std": 0.23495472967624664, "rewards/accuracy_reward/mean": 0.208984375, "rewards/accuracy_reward/std": 0.40698084235191345, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98974609375, "rewards/tag_count_reward/std": 0.08717872947454453, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 426.107421875, "completions/mean_terminated_length": 406.8755187988281, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2563815116882324, "epoch": 0.14271082280641859, "frac_reward_zero_std": 0.625, "grad_norm": 0.1162179028588284, "learning_rate": 9.95094940693873e-07, "loss": 0.0521, "num_tokens": 218061436.0, "reward": 1.0244140625, "reward_std": 0.10868444293737411, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.08079168945550919, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1755.0, "completions/mean_length": 420.6484375, "completions/mean_terminated_length": 414.2666931152344, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.23153818398714066, "epoch": 0.14305223625810856, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11973770705009551, "learning_rate": 9.95015654787215e-07, "loss": 0.0457, "num_tokens": 218348872.0, "reward": 1.1396484375, "reward_std": 0.14300721883773804, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 406.505859375, "completions/mean_terminated_length": 393.5807189941406, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.29150231182575226, "epoch": 0.14339364970979857, "frac_reward_zero_std": 0.65625, "grad_norm": 0.18787418884587784, "learning_rate": 9.94935736782914e-07, "loss": 0.0692, "num_tokens": 218641451.0, "reward": 1.041015625, "reward_std": 0.11325598508119583, "rewards/accuracy_reward/mean": 0.04838709533214569, "rewards/accuracy_reward/std": 0.21479946374893188, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1828.0, "completions/mean_length": 456.68359375, "completions/mean_terminated_length": 434.6257629394531, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.2323227971792221, "epoch": 0.14373506316148857, "frac_reward_zero_std": 0.5, "grad_norm": 0.13400092162032476, "learning_rate": 9.948551867944848e-07, "loss": 0.0788, "num_tokens": 218960073.0, "reward": 1.13818359375, "reward_std": 0.18036231398582458, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98974609375, "rewards/tag_count_reward/std": 0.08717872947454453, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1475.0, "completions/mean_length": 428.15625, "completions/mean_terminated_length": 412.18145751953125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.2647799775004387, "epoch": 0.14407647661317857, "frac_reward_zero_std": 0.53125, "grad_norm": 0.15065221425928269, "learning_rate": 9.947740049363404e-07, "loss": 0.0744, "num_tokens": 219257385.0, "reward": 1.099609375, "reward_std": 0.1599734127521515, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.07459938526153564, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 407.62109375, "completions/mean_terminated_length": 394.7047119140625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.2624846249818802, "epoch": 0.14441789006486855, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13703253199884963, "learning_rate": 9.946921913237908e-07, "loss": 0.0317, "num_tokens": 219544023.0, "reward": 1.15380859375, "reward_std": 0.17762120068073273, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.06697065383195877, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1518.0, "completions/mean_length": 447.009765625, "completions/mean_terminated_length": 440.73138427734375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.24630200117826462, "epoch": 0.14475930351655855, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11990683346803975, "learning_rate": 9.946097460730436e-07, "loss": 0.0316, "num_tokens": 219849020.0, "reward": 1.1435546875, "reward_std": 0.17272081971168518, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 417.421875, "completions/mean_terminated_length": 404.5826721191406, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.2784285694360733, "epoch": 0.14510071696824856, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13877043825696986, "learning_rate": 9.945266693012037e-07, "loss": 0.0945, "num_tokens": 220146980.0, "reward": 1.0478515625, "reward_std": 0.12719687819480896, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06960996240377426, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1768.0, "completions/mean_length": 413.869140625, "completions/mean_terminated_length": 401.001953125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.26806990057229996, "epoch": 0.14544213041993856, "frac_reward_zero_std": 0.625, "grad_norm": 0.12408727961486789, "learning_rate": 9.944429611262728e-07, "loss": 0.0788, "num_tokens": 220434465.0, "reward": 1.12451171875, "reward_std": 0.13736669719219208, "rewards/accuracy_reward/mean": 0.13508065044879913, "rewards/accuracy_reward/std": 0.3421548008918762, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.06697065383195877, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1584.0, "completions/mean_length": 425.7890625, "completions/mean_terminated_length": 413.0157470703125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.24387053400278091, "epoch": 0.14578354387162853, "frac_reward_zero_std": 0.59375, "grad_norm": 0.9482338641287162, "learning_rate": 9.943586216671493e-07, "loss": 0.0736, "num_tokens": 220731029.0, "reward": 1.138671875, "reward_std": 0.14276140928268433, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1857.0, "completions/mean_length": 418.26953125, "completions/mean_terminated_length": 411.8784484863281, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2377127781510353, "epoch": 0.14612495732331854, "frac_reward_zero_std": 0.46875, "grad_norm": 0.14240017115408482, "learning_rate": 9.942736510436285e-07, "loss": 0.0164, "num_tokens": 221015983.0, "reward": 1.19482421875, "reward_std": 0.1869402825832367, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.39980348944664, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1912.0, "completions/mean_length": 420.669921875, "completions/mean_terminated_length": 401.37353515625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2315872572362423, "epoch": 0.14646637077500854, "frac_reward_zero_std": 0.625, "grad_norm": 0.10126640906884643, "learning_rate": 9.941880493764027e-07, "loss": 0.0282, "num_tokens": 221310550.0, "reward": 1.07470703125, "reward_std": 0.1323384940624237, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99072265625, "rewards/tag_count_reward/std": 0.0814909115433693, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 433.869140625, "completions/mean_terminated_length": 381.82861328125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.29446088522672653, "epoch": 0.14680778422669855, "frac_reward_zero_std": 0.53125, "grad_norm": 0.15411697336826874, "learning_rate": 9.941018167870596e-07, "loss": 0.0694, "num_tokens": 221619283.0, "reward": 1.08984375, "reward_std": 0.17755842208862305, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.974609375, "rewards/tag_count_reward/std": 0.13395914435386658, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 428.5234375, "completions/mean_terminated_length": 415.7716369628906, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2590852454304695, "epoch": 0.14714919767838852, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12056106603120102, "learning_rate": 9.94014953398083e-07, "loss": 0.0714, "num_tokens": 221920799.0, "reward": 1.158203125, "reward_std": 0.11609946191310883, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1907.0, "completions/mean_length": 457.189453125, "completions/mean_terminated_length": 399.25506591796875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.280024491250515, "epoch": 0.14749061113007852, "frac_reward_zero_std": 0.625, "grad_norm": 0.11902161596747701, "learning_rate": 9.939274593328542e-07, "loss": 0.0623, "num_tokens": 222240496.0, "reward": 1.0693359375, "reward_std": 0.11843875050544739, "rewards/accuracy_reward/mean": 0.10080645233392715, "rewards/accuracy_reward/std": 0.30137622356414795, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9716796875, "rewards/tag_count_reward/std": 0.14138229191303253, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 407.474609375, "completions/mean_terminated_length": 397.8055114746094, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.27598991245031357, "epoch": 0.14783202458176853, "frac_reward_zero_std": 0.5625, "grad_norm": 0.16514471721002452, "learning_rate": 9.938393347156485e-07, "loss": 0.0593, "num_tokens": 222531811.0, "reward": 1.13623046875, "reward_std": 0.13937757909297943, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1834.0, "completions/mean_length": 457.115234375, "completions/mean_terminated_length": 441.4260559082031, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.23672433197498322, "epoch": 0.1481734380334585, "frac_reward_zero_std": 0.625, "grad_norm": 0.11109500523376603, "learning_rate": 9.93750579671638e-07, "loss": 0.0306, "num_tokens": 222842814.0, "reward": 1.09033203125, "reward_std": 0.13874046504497528, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 436.62890625, "completions/mean_terminated_length": 427.13165283203125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2294909954071045, "epoch": 0.1485148514851485, "frac_reward_zero_std": 0.625, "grad_norm": 0.11954189523814086, "learning_rate": 9.936611943268895e-07, "loss": 0.0012, "num_tokens": 223145712.0, "reward": 1.09521484375, "reward_std": 0.1130741685628891, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05285605043172836, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1852.0, "completions/mean_length": 426.396484375, "completions/mean_terminated_length": 416.83892822265625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.2501469515264034, "epoch": 0.1488562649368385, "frac_reward_zero_std": 0.53125, "grad_norm": 0.14173377213664914, "learning_rate": 9.935711788083654e-07, "loss": 0.0439, "num_tokens": 223441659.0, "reward": 1.16943359375, "reward_std": 0.17362506687641144, "rewards/accuracy_reward/mean": 0.173828125, "rewards/accuracy_reward/std": 0.3793322443962097, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1296.0, "completions/mean_length": 393.54296875, "completions/mean_terminated_length": 390.3052673339844, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.25969694554805756, "epoch": 0.14919767838852852, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13607284799860345, "learning_rate": 9.934805332439238e-07, "loss": 0.0193, "num_tokens": 223725089.0, "reward": 1.15283203125, "reward_std": 0.1507771611213684, "rewards/accuracy_reward/mean": 0.15927419066429138, "rewards/accuracy_reward/std": 0.3663010001182556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2047.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 415.87890625, "completions/mean_terminated_length": 363.2620849609375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.2933284565806389, "epoch": 0.1495390918402185, "frac_reward_zero_std": 0.6875, "grad_norm": 0.14131541715337703, "learning_rate": 9.933892577623165e-07, "loss": -0.0122, "num_tokens": 224017923.0, "reward": 1.07177734375, "reward_std": 0.10532590746879578, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97607421875, "rewards/tag_count_reward/std": 0.13100102543830872, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 400.1171875, "completions/mean_terminated_length": 396.8923645019531, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.2584180012345314, "epoch": 0.1498805052919085, "frac_reward_zero_std": 0.6875, "grad_norm": 0.13933838226956075, "learning_rate": 9.932973524931909e-07, "loss": 0.0307, "num_tokens": 224293887.0, "reward": 1.0478515625, "reward_std": 0.10268203169107437, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1715.0, "completions/mean_length": 399.20703125, "completions/mean_terminated_length": 392.7412109375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.2649410292506218, "epoch": 0.1502219187435985, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12530811715958223, "learning_rate": 9.932048175670886e-07, "loss": 0.0348, "num_tokens": 224579289.0, "reward": 1.1416015625, "reward_std": 0.10754109174013138, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1803.0, "completions/mean_length": 406.201171875, "completions/mean_terminated_length": 396.5245666503906, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2664489597082138, "epoch": 0.1505633321952885, "frac_reward_zero_std": 0.5625, "grad_norm": 0.15270763216434446, "learning_rate": 9.931116531154458e-07, "loss": 0.0123, "num_tokens": 224855392.0, "reward": 1.11767578125, "reward_std": 0.14485642313957214, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.06134068965911865, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1375.0, "completions/mean_length": 455.1171875, "completions/mean_terminated_length": 397.1072998046875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.20108620077371597, "epoch": 0.15090474564697848, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12665326656535025, "learning_rate": 9.930178592705929e-07, "loss": 0.0415, "num_tokens": 225166684.0, "reward": 1.1748046875, "reward_std": 0.1628543734550476, "rewards/accuracy_reward/mean": 0.20766128599643707, "rewards/accuracy_reward/std": 0.4060424566268921, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9736328125, "rewards/tag_count_reward/std": 0.13648539781570435, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1626.0, "completions/mean_length": 404.904296875, "completions/mean_terminated_length": 401.6888427734375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.2502869591116905, "epoch": 0.15124615909866848, "frac_reward_zero_std": 0.59375, "grad_norm": 0.142594788432444, "learning_rate": 9.92923436165754e-07, "loss": 0.017, "num_tokens": 225456299.0, "reward": 1.0986328125, "reward_std": 0.13303008675575256, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1169.0, "completions/mean_length": 403.09375, "completions/mean_terminated_length": 399.874755859375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.24036934599280357, "epoch": 0.15158757255035848, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14396050050879192, "learning_rate": 9.928283839350469e-07, "loss": -0.0031, "num_tokens": 225740891.0, "reward": 1.10498046875, "reward_std": 0.16954049468040466, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.039800092577934265, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1194.0, "completions/mean_length": 436.00390625, "completions/mean_terminated_length": 432.84930419921875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2512326203286648, "epoch": 0.1519289860020485, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11634473337223397, "learning_rate": 9.927327027134833e-07, "loss": 0.009, "num_tokens": 226041133.0, "reward": 1.10009765625, "reward_std": 0.1239260733127594, "rewards/accuracy_reward/mean": 0.10483870655298233, "rewards/accuracy_reward/std": 0.30665475130081177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1843.0, "completions/mean_length": 444.533203125, "completions/mean_terminated_length": 431.907470703125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.2512010708451271, "epoch": 0.15227039945373846, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13984083961339022, "learning_rate": 9.926363926369685e-07, "loss": 0.046, "num_tokens": 226343758.0, "reward": 1.068359375, "reward_std": 0.13671942055225372, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 436.84375, "completions/mean_terminated_length": 414.5108947753906, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2500085085630417, "epoch": 0.15261181290542847, "frac_reward_zero_std": 0.53125, "grad_norm": 0.16825647060284474, "learning_rate": 9.925394538423005e-07, "loss": 0.1395, "num_tokens": 226639278.0, "reward": 1.07080078125, "reward_std": 0.14505434036254883, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99072265625, "rewards/tag_count_reward/std": 0.0814909115433693, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 483.748046875, "completions/mean_terminated_length": 455.7594299316406, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2548343129456043, "epoch": 0.15295322635711847, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1354106855101298, "learning_rate": 9.924418864671708e-07, "loss": 0.0587, "num_tokens": 226964749.0, "reward": 1.12744140625, "reward_std": 0.14575591683387756, "rewards/accuracy_reward/mean": 0.14516128599643707, "rewards/accuracy_reward/std": 0.3526190221309662, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98681640625, "rewards/tag_count_reward/std": 0.09865544736385345, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 453.021484375, "completions/mean_terminated_length": 437.29193115234375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2390621304512024, "epoch": 0.15329463980880847, "frac_reward_zero_std": 0.625, "grad_norm": 0.11634594590020436, "learning_rate": 9.923436906501635e-07, "loss": 0.0426, "num_tokens": 227267464.0, "reward": 1.11474609375, "reward_std": 0.13143394887447357, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.06697065383195877, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1511.0, "completions/mean_length": 394.865234375, "completions/mean_terminated_length": 391.630126953125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.26389072835445404, "epoch": 0.15363605326049845, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11719425538574486, "learning_rate": 9.922448665307552e-07, "loss": 0.0276, "num_tokens": 227544451.0, "reward": 1.21337890625, "reward_std": 0.11902827024459839, "rewards/accuracy_reward/mean": 0.21484375, "rewards/accuracy_reward/std": 0.4111155867576599, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1754.0, "completions/mean_length": 411.705078125, "completions/mean_terminated_length": 405.28826904296875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2669088765978813, "epoch": 0.15397746671218845, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5313889504379087, "learning_rate": 9.921454142493155e-07, "loss": 0.0146, "num_tokens": 227836732.0, "reward": 1.111328125, "reward_std": 0.14310218393802643, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1485.0, "completions/mean_length": 444.552734375, "completions/mean_terminated_length": 386.1599426269531, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 0.28912773728370667, "epoch": 0.15431888016387846, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1318071946845383, "learning_rate": 9.920453339471052e-07, "loss": 0.0327, "num_tokens": 228140631.0, "reward": 1.05810546875, "reward_std": 0.12069268524646759, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97412109375, "rewards/tag_count_reward/std": 0.13613051176071167, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1532.0, "completions/mean_length": 386.796875, "completions/mean_terminated_length": 380.2823791503906, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.2995990365743637, "epoch": 0.15466029361556846, "frac_reward_zero_std": 0.625, "grad_norm": 0.13796951595396809, "learning_rate": 9.919446257662785e-07, "loss": 0.0377, "num_tokens": 228423935.0, "reward": 1.177734375, "reward_std": 0.11489175260066986, "rewards/accuracy_reward/mean": 0.181640625, "rewards/accuracy_reward/std": 0.38592514395713806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.051725368946790695, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1530.0, "completions/mean_length": 424.271484375, "completions/mean_terminated_length": 411.4862060546875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.24779526516795158, "epoch": 0.15500170706725844, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1133479830032378, "learning_rate": 9.918432898498802e-07, "loss": 0.0171, "num_tokens": 228714826.0, "reward": 1.1015625, "reward_std": 0.12259122729301453, "rewards/accuracy_reward/mean": 0.11088709533214569, "rewards/accuracy_reward/std": 0.3143092691898346, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1446.0, "completions/mean_length": 486.5234375, "completions/mean_terminated_length": 464.87921142578125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.25646355748176575, "epoch": 0.15534312051894844, "frac_reward_zero_std": 0.625, "grad_norm": 0.10086488234814267, "learning_rate": 9.917413263418474e-07, "loss": 0.0776, "num_tokens": 229045318.0, "reward": 1.06787109375, "reward_std": 0.13897308707237244, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98974609375, "rewards/tag_count_reward/std": 0.08717872947454453, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1913.0, "completions/mean_length": 403.642578125, "completions/mean_terminated_length": 400.4246520996094, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.27720969915390015, "epoch": 0.15568453397063844, "frac_reward_zero_std": 0.75, "grad_norm": 0.11309105394405229, "learning_rate": 9.916387353870085e-07, "loss": 0.0212, "num_tokens": 229337295.0, "reward": 1.10400390625, "reward_std": 0.08999452739953995, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98291015625, "rewards/tag_count_reward/std": 0.09292974323034286, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1860.0, "completions/mean_length": 409.744140625, "completions/mean_terminated_length": 396.844482421875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.27122777700424194, "epoch": 0.15602594742232845, "frac_reward_zero_std": 0.59375, "grad_norm": 0.15338699374783707, "learning_rate": 9.91535517131083e-07, "loss": 0.0756, "num_tokens": 229617532.0, "reward": 1.091796875, "reward_std": 0.1278151571750641, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 417.810546875, "completions/mean_terminated_length": 414.620361328125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2386585995554924, "epoch": 0.15636736087401842, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1464187501189123, "learning_rate": 9.914316717206816e-07, "loss": 0.026, "num_tokens": 229909131.0, "reward": 1.15087890625, "reward_std": 0.1614130139350891, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1441.0, "completions/max_terminated_length": 1441.0, "completions/mean_length": 453.1328125, "completions/mean_terminated_length": 453.1328125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.2372812181711197, "epoch": 0.15670877432570843, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11070885930991516, "learning_rate": 9.913271993033058e-07, "loss": 0.0103, "num_tokens": 230216447.0, "reward": 1.201171875, "reward_std": 0.13381731510162354, "rewards/accuracy_reward/mean": 0.201171875, "rewards/accuracy_reward/std": 0.4012683033943176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1648.0, "completions/max_terminated_length": 1648.0, "completions/mean_length": 432.427734375, "completions/mean_terminated_length": 432.427734375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.25607936829328537, "epoch": 0.15705018777739843, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1386399507911122, "learning_rate": 9.912221000273474e-07, "loss": -0.0202, "num_tokens": 230517738.0, "reward": 1.0546875, "reward_std": 0.1028757244348526, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 516.458984375, "completions/mean_terminated_length": 492.1488342285156, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.23069246858358383, "epoch": 0.15739160122908843, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1140627429363734, "learning_rate": 9.91116374042089e-07, "loss": 0.0282, "num_tokens": 230860021.0, "reward": 1.056640625, "reward_std": 0.16854147613048553, "rewards/accuracy_reward/mean": 0.09072580933570862, "rewards/accuracy_reward/std": 0.2875087857246399, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14334554970264435, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1097.0, "completions/mean_length": 442.33203125, "completions/mean_terminated_length": 429.68896484375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.22685323283076286, "epoch": 0.1577330146807784, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10253765103528983, "learning_rate": 9.910100214977032e-07, "loss": 0.0413, "num_tokens": 231163135.0, "reward": 1.1142578125, "reward_std": 0.11960331350564957, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1613.0, "completions/mean_length": 468.701171875, "completions/mean_terminated_length": 414.4949645996094, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2566005140542984, "epoch": 0.15807442813246841, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10471804401732973, "learning_rate": 9.90903042545252e-07, "loss": 0.0188, "num_tokens": 231491494.0, "reward": 1.07080078125, "reward_std": 0.09641534090042114, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97509765625, "rewards/tag_count_reward/std": 0.13450638949871063, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 410.4140625, "completions/mean_terminated_length": 407.2093811035156, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.2548733055591583, "epoch": 0.15841584158415842, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10432075984695853, "learning_rate": 9.907954373366884e-07, "loss": 0.0221, "num_tokens": 231775242.0, "reward": 1.09228515625, "reward_std": 0.11698492616415024, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1577.0, "completions/max_terminated_length": 1577.0, "completions/mean_length": 433.97265625, "completions/mean_terminated_length": 433.97265625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 0.25420304387807846, "epoch": 0.15875725503584842, "frac_reward_zero_std": 0.75, "grad_norm": 0.10992253017862648, "learning_rate": 9.90687206024854e-07, "loss": -0.0116, "num_tokens": 232071532.0, "reward": 1.14453125, "reward_std": 0.0944494903087616, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 441.048828125, "completions/mean_terminated_length": 437.90411376953125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.2629197910428047, "epoch": 0.1590986684875384, "frac_reward_zero_std": 0.625, "grad_norm": 0.12113186385907584, "learning_rate": 9.905783487634796e-07, "loss": 0.0051, "num_tokens": 232372965.0, "reward": 1.10302734375, "reward_std": 0.12714646756649017, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.039800092577934265, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1840.0, "completions/mean_length": 453.83984375, "completions/mean_terminated_length": 434.936767578125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.238386869430542, "epoch": 0.1594400819392284, "frac_reward_zero_std": 0.625, "grad_norm": 0.1323910568345876, "learning_rate": 9.904688657071858e-07, "loss": 0.0851, "num_tokens": 232682787.0, "reward": 1.1044921875, "reward_std": 0.14320942759513855, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.08079168945550919, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1451.0, "completions/mean_length": 442.578125, "completions/mean_terminated_length": 429.93701171875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.24992968142032623, "epoch": 0.1597814953909184, "frac_reward_zero_std": 0.71875, "grad_norm": 0.1048962203471596, "learning_rate": 9.903587570114814e-07, "loss": 0.0538, "num_tokens": 232979563.0, "reward": 1.048828125, "reward_std": 0.1003195121884346, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1950.0, "completions/mean_length": 451.5390625, "completions/mean_terminated_length": 435.7948913574219, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.25596606731414795, "epoch": 0.1601229088426084, "frac_reward_zero_std": 0.75, "grad_norm": 0.14106870204479752, "learning_rate": 9.902480228327645e-07, "loss": 0.0484, "num_tokens": 233285871.0, "reward": 1.07373046875, "reward_std": 0.07759439200162888, "rewards/accuracy_reward/mean": 0.08266129344701767, "rewards/accuracy_reward/std": 0.2756475806236267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.06697065383195877, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 451.529296875, "completions/mean_terminated_length": 435.7850036621094, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.26181329041719437, "epoch": 0.16046432229429838, "frac_reward_zero_std": 0.40625, "grad_norm": 0.17124367796737014, "learning_rate": 9.90136663328321e-07, "loss": 0.0766, "num_tokens": 233597774.0, "reward": 1.15380859375, "reward_std": 0.20297390222549438, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98974609375, "rewards/tag_count_reward/std": 0.08286299556493759, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 461.009765625, "completions/mean_terminated_length": 448.5137634277344, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.23529412224888802, "epoch": 0.1608057357459884, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1547088534005227, "learning_rate": 9.900246786563254e-07, "loss": 0.0518, "num_tokens": 233903395.0, "reward": 1.125, "reward_std": 0.17914237082004547, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 453.0, "completions/mean_terminated_length": 443.5992431640625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.2535926587879658, "epoch": 0.1611471491976784, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13702565671379552, "learning_rate": 9.8991206897584e-07, "loss": 0.0484, "num_tokens": 234201043.0, "reward": 1.11962890625, "reward_std": 0.15555620193481445, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.06134068965911865, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1516.0, "completions/mean_length": 451.767578125, "completions/mean_terminated_length": 432.8399353027344, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.2339429035782814, "epoch": 0.1614885626493684, "frac_reward_zero_std": 0.625, "grad_norm": 0.12773479607449417, "learning_rate": 9.897988344468148e-07, "loss": 0.0961, "num_tokens": 234511212.0, "reward": 1.1533203125, "reward_std": 0.15923118591308594, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.08079168945550919, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1467.0, "completions/mean_length": 418.978515625, "completions/mean_terminated_length": 409.3772277832031, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.26347074657678604, "epoch": 0.16182997610105837, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11425176471718482, "learning_rate": 9.89684975230088e-07, "loss": 0.0281, "num_tokens": 234794433.0, "reward": 1.13525390625, "reward_std": 0.10248175263404846, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.06134068965911865, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1435.0, "completions/mean_length": 490.4765625, "completions/mean_terminated_length": 465.7539978027344, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.23065516352653503, "epoch": 0.16217138955274837, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1024599387507985, "learning_rate": 9.895704914873838e-07, "loss": 0.0746, "num_tokens": 235122149.0, "reward": 1.0615234375, "reward_std": 0.13101303577423096, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9873046875, "rewards/tag_count_reward/std": 0.0942835807800293, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1884.0, "completions/mean_length": 403.021484375, "completions/mean_terminated_length": 393.3261413574219, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.23955966159701347, "epoch": 0.16251280300443838, "frac_reward_zero_std": 0.4375, "grad_norm": 0.15608518922303552, "learning_rate": 9.89455383381315e-07, "loss": 0.0537, "num_tokens": 235405696.0, "reward": 1.15869140625, "reward_std": 0.2013128399848938, "rewards/accuracy_reward/mean": 0.16935484111309052, "rewards/accuracy_reward/std": 0.3754436671733856, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.06134068965911865, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 424.904296875, "completions/mean_terminated_length": 415.3379211425781, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.2692762389779091, "epoch": 0.16285421645612838, "frac_reward_zero_std": 0.65625, "grad_norm": 0.131061104552567, "learning_rate": 9.893396510753802e-07, "loss": 0.0545, "num_tokens": 235700591.0, "reward": 1.08056640625, "reward_std": 0.12009972333908081, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.06134068965911865, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1833.0, "completions/mean_length": 466.869140625, "completions/mean_terminated_length": 457.55010986328125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.2509899064898491, "epoch": 0.16319562990781836, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09731123445089242, "learning_rate": 9.892232947339646e-07, "loss": 0.056, "num_tokens": 236016668.0, "reward": 1.07666015625, "reward_std": 0.10727764666080475, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.06134068965911865, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 480.140625, "completions/mean_terminated_length": 461.5494384765625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.24840113893151283, "epoch": 0.16353704335950836, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10144863715695365, "learning_rate": 9.891063145223405e-07, "loss": 0.076, "num_tokens": 236335956.0, "reward": 1.0205078125, "reward_std": 0.08526228368282318, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.08079168945550919, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 447.203125, "completions/mean_terminated_length": 444.0704345703125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.21248015761375427, "epoch": 0.16387845681119836, "frac_reward_zero_std": 0.625, "grad_norm": 0.12299207234586733, "learning_rate": 9.889887106066654e-07, "loss": 0.0217, "num_tokens": 236642860.0, "reward": 1.14990234375, "reward_std": 0.13756723701953888, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.039800092577934265, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1429.0, "completions/mean_length": 454.82421875, "completions/mean_terminated_length": 445.4342041015625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.24622974544763565, "epoch": 0.16421987026288837, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1301096458356534, "learning_rate": 9.888704831539838e-07, "loss": 0.0222, "num_tokens": 236950194.0, "reward": 1.13134765625, "reward_std": 0.15281695127487183, "rewards/accuracy_reward/mean": 0.1391129046678543, "rewards/accuracy_reward/std": 0.34641367197036743, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 488.931640625, "completions/mean_terminated_length": 470.4446716308594, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.24348296597599983, "epoch": 0.16456128371457834, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12448441903411532, "learning_rate": 9.88751632332225e-07, "loss": 0.0929, "num_tokens": 237284511.0, "reward": 1.07568359375, "reward_std": 0.10906431078910828, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99169921875, "rewards/tag_count_reward/std": 0.0753624215722084, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1223.0, "completions/mean_length": 437.3828125, "completions/mean_terminated_length": 424.7007751464844, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2259848788380623, "epoch": 0.16490269716626835, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10157234847221762, "learning_rate": 9.88632158310204e-07, "loss": 0.0438, "num_tokens": 237580947.0, "reward": 1.08740234375, "reward_std": 0.11121685802936554, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99169921875, "rewards/tag_count_reward/std": 0.07696826010942459, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1299.0, "completions/mean_length": 429.26171875, "completions/mean_terminated_length": 413.2978210449219, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.23638497292995453, "epoch": 0.16524411061795835, "frac_reward_zero_std": 0.625, "grad_norm": 0.12227524601894259, "learning_rate": 9.885120612576208e-07, "loss": 0.085, "num_tokens": 237880937.0, "reward": 1.11572265625, "reward_std": 0.12642371654510498, "rewards/accuracy_reward/mean": 0.1270161271095276, "rewards/accuracy_reward/std": 0.3333272337913513, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 465.48828125, "completions/mean_terminated_length": 459.2823791503906, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.24573494121432304, "epoch": 0.16558552406964835, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1464196374691592, "learning_rate": 9.88391341345061e-07, "loss": 0.0408, "num_tokens": 238210467.0, "reward": 1.14013671875, "reward_std": 0.17286238074302673, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 451.75390625, "completions/mean_terminated_length": 439.1850280761719, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.25005803257226944, "epoch": 0.16592693752133833, "frac_reward_zero_std": 0.625, "grad_norm": 0.12160140773151035, "learning_rate": 9.882699987439943e-07, "loss": 0.0233, "num_tokens": 238523093.0, "reward": 1.0791015625, "reward_std": 0.13354924321174622, "rewards/accuracy_reward/mean": 0.08870967477560043, "rewards/accuracy_reward/std": 0.284611314535141, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06960996240377426, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 428.009765625, "completions/mean_terminated_length": 418.4617004394531, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.278684601187706, "epoch": 0.16626835097302833, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11841307651538391, "learning_rate": 9.88148033626775e-07, "loss": 0.0411, "num_tokens": 238818954.0, "reward": 1.09326171875, "reward_std": 0.10769633948802948, "rewards/accuracy_reward/mean": 0.10080645233392715, "rewards/accuracy_reward/std": 0.30137622356414795, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 511.8359375, "completions/mean_terminated_length": 490.5426025390625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.21939270943403244, "epoch": 0.16660976442471834, "frac_reward_zero_std": 0.5625, "grad_norm": 0.129214480355415, "learning_rate": 9.880254461666415e-07, "loss": 0.0807, "num_tokens": 239155382.0, "reward": 1.08544921875, "reward_std": 0.13240471482276917, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98974609375, "rewards/tag_count_reward/std": 0.08717872947454453, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 478.05859375, "completions/mean_terminated_length": 462.575927734375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.24564239755272865, "epoch": 0.16695117787640834, "frac_reward_zero_std": 0.71875, "grad_norm": 0.0933061085601777, "learning_rate": 9.879022365377164e-07, "loss": 0.0423, "num_tokens": 239485204.0, "reward": 1.07080078125, "reward_std": 0.09408167004585266, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1736.0, "completions/mean_length": 448.6875, "completions/mean_terminated_length": 442.41571044921875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.24718814715743065, "epoch": 0.16729259132809832, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13615977430329543, "learning_rate": 9.877784049150062e-07, "loss": 0.021, "num_tokens": 239793252.0, "reward": 1.1474609375, "reward_std": 0.15849491953849792, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06600234657526016, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 484.5, "completions/mean_terminated_length": 450.1716613769531, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.23883453011512756, "epoch": 0.16763400477978832, "frac_reward_zero_std": 0.5, "grad_norm": 0.1391956732222375, "learning_rate": 9.876539514744e-07, "loss": 0.1102, "num_tokens": 240115524.0, "reward": 1.08349609375, "reward_std": 0.1839390993118286, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98388671875, "rewards/tag_count_reward/std": 0.10885065048933029, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1875.0, "completions/mean_length": 477.634765625, "completions/mean_terminated_length": 455.8673400878906, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.24501565471291542, "epoch": 0.16797541823147832, "frac_reward_zero_std": 0.625, "grad_norm": 0.11323199724431479, "learning_rate": 9.875288763926716e-07, "loss": 0.0439, "num_tokens": 240435017.0, "reward": 1.18408203125, "reward_std": 0.13257968425750732, "rewards/accuracy_reward/mean": 0.193359375, "rewards/accuracy_reward/std": 0.39531853795051575, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99072265625, "rewards/tag_count_reward/std": 0.0814909115433693, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1681.0, "completions/mean_length": 493.884765625, "completions/mean_terminated_length": 487.79022216796875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.23549573868513107, "epoch": 0.16831683168316833, "frac_reward_zero_std": 0.625, "grad_norm": 0.10640350888846618, "learning_rate": 9.87403179847476e-07, "loss": 0.0127, "num_tokens": 240762878.0, "reward": 1.15283203125, "reward_std": 0.154703289270401, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 413.861328125, "completions/mean_terminated_length": 394.48419189453125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.2689219117164612, "epoch": 0.1686582451348583, "frac_reward_zero_std": 0.625, "grad_norm": 0.12518157435531385, "learning_rate": 9.872768620173523e-07, "loss": 0.0492, "num_tokens": 241043447.0, "reward": 1.0869140625, "reward_std": 0.14506080746650696, "rewards/accuracy_reward/mean": 0.09879032522439957, "rewards/accuracy_reward/std": 0.2986815273761749, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.08079168945550919, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 398.431640625, "completions/mean_terminated_length": 391.9627685546875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.3005271628499031, "epoch": 0.1689996585865483, "frac_reward_zero_std": 0.625, "grad_norm": 0.14531839526994986, "learning_rate": 9.87149923081722e-07, "loss": -0.0258, "num_tokens": 241328564.0, "reward": 1.134765625, "reward_std": 0.1478579342365265, "rewards/accuracy_reward/mean": 0.1411290317773819, "rewards/accuracy_reward/std": 0.3485061228275299, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 452.3203125, "completions/mean_terminated_length": 439.7558898925781, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.24264652654528618, "epoch": 0.1693410720382383, "frac_reward_zero_std": 0.40625, "grad_norm": 0.1377801937753084, "learning_rate": 9.870223632208875e-07, "loss": 0.0523, "num_tokens": 241630968.0, "reward": 1.1796875, "reward_std": 0.2017083615064621, "rewards/accuracy_reward/mean": 0.185546875, "rewards/accuracy_reward/std": 0.38912075757980347, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1257.0, "completions/mean_length": 435.09765625, "completions/mean_terminated_length": 422.39764404296875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.2606082111597061, "epoch": 0.16968248548992831, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12464908449432739, "learning_rate": 9.868941826160349e-07, "loss": 0.0165, "num_tokens": 241930442.0, "reward": 1.08203125, "reward_std": 0.12042922526597977, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1655.0, "completions/mean_length": 431.15234375, "completions/mean_terminated_length": 427.9882507324219, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.2544957175850868, "epoch": 0.1700238989416183, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11559079161560178, "learning_rate": 9.86765381449231e-07, "loss": 0.021, "num_tokens": 242231736.0, "reward": 1.15283203125, "reward_std": 0.12339599430561066, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1944.0, "completions/mean_length": 476.958984375, "completions/mean_terminated_length": 458.3300476074219, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.2490190751850605, "epoch": 0.1703653123933083, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1359257831214116, "learning_rate": 9.866359599034239e-07, "loss": 0.0692, "num_tokens": 242551827.0, "reward": 1.09228515625, "reward_std": 0.15846316516399384, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99072265625, "rewards/tag_count_reward/std": 0.0814909115433693, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1307.0, "completions/mean_length": 429.8515625, "completions/mean_terminated_length": 423.50592041015625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.2846025973558426, "epoch": 0.1707067258449983, "frac_reward_zero_std": 0.625, "grad_norm": 0.1321352652602892, "learning_rate": 9.865059181624434e-07, "loss": 0.007, "num_tokens": 242846503.0, "reward": 1.1884765625, "reward_std": 0.12783613801002502, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.3937928080558777, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1765.0, "completions/mean_length": 481.740234375, "completions/mean_terminated_length": 478.6751403808594, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.221136212348938, "epoch": 0.1710481392966883, "frac_reward_zero_std": 0.625, "grad_norm": 0.08871846221030062, "learning_rate": 9.863752564110003e-07, "loss": 0.0056, "num_tokens": 243173026.0, "reward": 1.130859375, "reward_std": 0.12943288683891296, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1560.0, "completions/mean_length": 426.73046875, "completions/mean_terminated_length": 420.3725891113281, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.25116079673171043, "epoch": 0.17138955274837828, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1198048793952026, "learning_rate": 9.862439748346854e-07, "loss": 0.0132, "num_tokens": 243465064.0, "reward": 1.10107421875, "reward_std": 0.10621839761734009, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1874.0, "completions/mean_length": 493.138671875, "completions/mean_terminated_length": 477.80474853515625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.23132016137242317, "epoch": 0.17173096620006828, "frac_reward_zero_std": 0.46875, "grad_norm": 0.15163467978115538, "learning_rate": 9.861120736199701e-07, "loss": 0.0961, "num_tokens": 243801119.0, "reward": 1.10595703125, "reward_std": 0.16933369636535645, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.0704338550567627, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1234.0, "completions/mean_length": 438.03515625, "completions/mean_terminated_length": 434.8845520019531, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.23953776061534882, "epoch": 0.17207237965175828, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10376938537997743, "learning_rate": 9.859795529542069e-07, "loss": 0.0149, "num_tokens": 244103777.0, "reward": 1.109375, "reward_std": 0.10398061573505402, "rewards/accuracy_reward/mean": 0.11491935700178146, "rewards/accuracy_reward/std": 0.3192465901374817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1686.0, "completions/mean_length": 437.78515625, "completions/mean_terminated_length": 431.4706115722656, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.28928231447935104, "epoch": 0.1724137931034483, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14707049135645697, "learning_rate": 9.858464130256268e-07, "loss": 0.0474, "num_tokens": 244402051.0, "reward": 1.2119140625, "reward_std": 0.1662474423646927, "rewards/accuracy_reward/mean": 0.21484375, "rewards/accuracy_reward/std": 0.4111155867576599, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1691.0, "completions/mean_length": 462.552734375, "completions/mean_terminated_length": 456.3353271484375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.2459535077214241, "epoch": 0.17275520655513826, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13718035553767696, "learning_rate": 9.85712654023341e-07, "loss": 0.0464, "num_tokens": 244717502.0, "reward": 1.1748046875, "reward_std": 0.1837637722492218, "rewards/accuracy_reward/mean": 0.177734375, "rewards/accuracy_reward/std": 0.3826628625392914, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1174.0, "completions/mean_length": 454.408203125, "completions/mean_terminated_length": 448.1588439941406, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.24907024577260017, "epoch": 0.17309662000682827, "frac_reward_zero_std": 0.625, "grad_norm": 0.10999151448695513, "learning_rate": 9.855782761373402e-07, "loss": 0.0175, "num_tokens": 245030591.0, "reward": 1.1318359375, "reward_std": 0.12418824434280396, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 449.015625, "completions/mean_terminated_length": 442.7451171875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.26462653279304504, "epoch": 0.17343803345851827, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11317360626665804, "learning_rate": 9.854432795584938e-07, "loss": 0.0192, "num_tokens": 245342375.0, "reward": 1.1025390625, "reward_std": 0.13826242089271545, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1539.0, "completions/mean_length": 503.42578125, "completions/mean_terminated_length": 450.4101257324219, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.2709595486521721, "epoch": 0.17377944691020827, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11064161746009978, "learning_rate": 9.853076644785505e-07, "loss": 0.0559, "num_tokens": 245687073.0, "reward": 1.04443359375, "reward_std": 0.09091901779174805, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97412109375, "rewards/tag_count_reward/std": 0.13613051176071167, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 528.05859375, "completions/mean_terminated_length": 513.0690307617188, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.21518215164542198, "epoch": 0.17412086036189825, "frac_reward_zero_std": 0.75, "grad_norm": 0.07982094958423591, "learning_rate": 9.851714310901365e-07, "loss": 0.0373, "num_tokens": 246030959.0, "reward": 1.08740234375, "reward_std": 0.09434814751148224, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99169921875, "rewards/tag_count_reward/std": 0.07696826010942459, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 430.916015625, "completions/mean_terminated_length": 424.57452392578125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.2581406496465206, "epoch": 0.17446227381358825, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11084882395487476, "learning_rate": 9.850345795867576e-07, "loss": 0.0262, "num_tokens": 246330884.0, "reward": 1.060546875, "reward_std": 0.08098877221345901, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1584.0, "completions/mean_length": 447.150390625, "completions/mean_terminated_length": 434.5452880859375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2711602970957756, "epoch": 0.17480368726527826, "frac_reward_zero_std": 0.625, "grad_norm": 0.12986198607090976, "learning_rate": 9.848971101627965e-07, "loss": 0.0492, "num_tokens": 246635665.0, "reward": 1.060546875, "reward_std": 0.1290496587753296, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1640.0, "completions/mean_length": 465.599609375, "completions/mean_terminated_length": 459.3941345214844, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.26395220309495926, "epoch": 0.17514510071696826, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11575981724561246, "learning_rate": 9.847590230135142e-07, "loss": 0.0185, "num_tokens": 246958388.0, "reward": 1.1083984375, "reward_std": 0.1350170373916626, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 447.8359375, "completions/mean_terminated_length": 444.7044982910156, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.2793165296316147, "epoch": 0.17548651416865824, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13631518377413399, "learning_rate": 9.846203183350486e-07, "loss": 0.0221, "num_tokens": 247266672.0, "reward": 1.103515625, "reward_std": 0.16034367680549622, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 475.087890625, "completions/mean_terminated_length": 440.56884765625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.2806601896882057, "epoch": 0.17582792762034824, "frac_reward_zero_std": 0.59375, "grad_norm": 0.15703813169390562, "learning_rate": 9.844809963244153e-07, "loss": 0.0522, "num_tokens": 247587405.0, "reward": 1.11767578125, "reward_std": 0.13633525371551514, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97705078125, "rewards/tag_count_reward/std": 0.12048374861478806, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1832.0, "completions/mean_length": 485.1328125, "completions/mean_terminated_length": 475.9214172363281, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.24153149127960205, "epoch": 0.17616934107203824, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12851972143047175, "learning_rate": 9.84341057179506e-07, "loss": 0.0076, "num_tokens": 247919601.0, "reward": 1.13671875, "reward_std": 0.1737358570098877, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.062285590916872025, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 461.98046875, "completions/mean_terminated_length": 455.76080322265625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.22365814819931984, "epoch": 0.17651075452372825, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11298910962520343, "learning_rate": 9.8420050109909e-07, "loss": 0.0299, "num_tokens": 248233431.0, "reward": 1.1025390625, "reward_std": 0.0970916822552681, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1922.0, "completions/mean_length": 553.5078125, "completions/mean_terminated_length": 483.2474365234375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.23243922740221024, "epoch": 0.17685216797541822, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1469050696737627, "learning_rate": 9.840593282828121e-07, "loss": 0.0635, "num_tokens": 248601195.0, "reward": 1.0322265625, "reward_std": 0.13319778442382812, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9677734375, "rewards/tag_count_reward/std": 0.14816728234291077, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1913.0, "completions/mean_length": 485.921875, "completions/mean_terminated_length": 476.71514892578125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.25511204451322556, "epoch": 0.17719358142710823, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13458338507530604, "learning_rate": 9.839175389311934e-07, "loss": 0.0347, "num_tokens": 248931923.0, "reward": 1.08544921875, "reward_std": 0.14247199892997742, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 457.865234375, "completions/mean_terminated_length": 454.75341796875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.2627585083246231, "epoch": 0.17753499487879823, "frac_reward_zero_std": 0.75, "grad_norm": 0.10540769722482222, "learning_rate": 9.837751332456306e-07, "loss": 0.0165, "num_tokens": 249245710.0, "reward": 1.11962890625, "reward_std": 0.09256304800510406, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1814.0, "completions/mean_length": 414.9609375, "completions/mean_terminated_length": 411.7651672363281, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.27028291672468185, "epoch": 0.17787640833048823, "frac_reward_zero_std": 0.625, "grad_norm": 0.16039963489679282, "learning_rate": 9.83632111428396e-07, "loss": 0.0131, "num_tokens": 249541370.0, "reward": 1.07666015625, "reward_std": 0.136118546128273, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1419.0, "completions/mean_length": 488.810546875, "completions/mean_terminated_length": 485.75927734375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.23474112153053284, "epoch": 0.1782178217821782, "frac_reward_zero_std": 0.875, "grad_norm": 0.0798174914396759, "learning_rate": 9.834884736826366e-07, "loss": 0.0157, "num_tokens": 249869081.0, "reward": 1.06298828125, "reward_std": 0.054977502673864365, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1915.0, "completions/mean_length": 471.796875, "completions/mean_terminated_length": 456.2524719238281, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.2584024891257286, "epoch": 0.1785592352338682, "frac_reward_zero_std": 0.78125, "grad_norm": 0.10315317970297926, "learning_rate": 9.833442202123754e-07, "loss": 0.083, "num_tokens": 250184529.0, "reward": 1.03759765625, "reward_std": 0.06906723231077194, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1925.0, "completions/mean_length": 459.97265625, "completions/mean_terminated_length": 450.61297607421875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 0.24765276536345482, "epoch": 0.17890064868555822, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1254224503530009, "learning_rate": 9.831993512225084e-07, "loss": 0.0262, "num_tokens": 250494627.0, "reward": 1.17724609375, "reward_std": 0.18510189652442932, "rewards/accuracy_reward/mean": 0.181640625, "rewards/accuracy_reward/std": 0.38592514395713806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1914.0, "completions/mean_length": 423.556640625, "completions/mean_terminated_length": 413.9823303222656, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.28037848323583603, "epoch": 0.17924206213724822, "frac_reward_zero_std": 0.625, "grad_norm": 0.12326630890657676, "learning_rate": 9.830538669188068e-07, "loss": 0.0575, "num_tokens": 250792608.0, "reward": 1.1064453125, "reward_std": 0.12281246483325958, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1501.0, "completions/mean_length": 520.119140625, "completions/mean_terminated_length": 486.5728454589844, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.23646292462944984, "epoch": 0.1795834755889382, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11945086073390986, "learning_rate": 9.829077675079162e-07, "loss": 0.0404, "num_tokens": 251137629.0, "reward": 1.10791015625, "reward_std": 0.16453522443771362, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98291015625, "rewards/tag_count_reward/std": 0.11092886328697205, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1417.0, "completions/mean_length": 518.509765625, "completions/mean_terminated_length": 466.01416015625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.25329068303108215, "epoch": 0.1799248890406282, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1290786121778825, "learning_rate": 9.827610531973547e-07, "loss": 0.0272, "num_tokens": 251485842.0, "reward": 1.09716796875, "reward_std": 0.16562320291996002, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97412109375, "rewards/tag_count_reward/std": 0.13613051176071167, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1328.0, "completions/mean_length": 418.85546875, "completions/mean_terminated_length": 415.6673278808594, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.24123474210500717, "epoch": 0.1802663024923182, "frac_reward_zero_std": 0.6875, "grad_norm": 0.13767069205538437, "learning_rate": 9.826137241955148e-07, "loss": 0.0143, "num_tokens": 251771672.0, "reward": 1.14306640625, "reward_std": 0.10565373301506042, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 459.83984375, "completions/mean_terminated_length": 447.33465576171875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.24282551184296608, "epoch": 0.1806077159440082, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11603709271850994, "learning_rate": 9.824657807116617e-07, "loss": 0.0343, "num_tokens": 252079574.0, "reward": 1.119140625, "reward_std": 0.1238507628440857, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 503.365234375, "completions/mean_terminated_length": 488.13214111328125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.24303887784481049, "epoch": 0.18094912939569818, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12974838886924717, "learning_rate": 9.823172229559335e-07, "loss": 0.0718, "num_tokens": 252416417.0, "reward": 1.09619140625, "reward_std": 0.15677280724048615, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 434.669921875, "completions/mean_terminated_length": 428.3431701660156, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.2296394407749176, "epoch": 0.1812905428473882, "frac_reward_zero_std": 0.6875, "grad_norm": 0.13111344398702232, "learning_rate": 9.821680511393407e-07, "loss": 0.0293, "num_tokens": 252719560.0, "reward": 1.095703125, "reward_std": 0.12054785341024399, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04930410906672478, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 474.34375, "completions/mean_terminated_length": 455.6838073730469, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.2545858696103096, "epoch": 0.1816319562990782, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11438043985893924, "learning_rate": 9.82018265473766e-07, "loss": 0.0587, "num_tokens": 253046424.0, "reward": 1.1357421875, "reward_std": 0.1431073546409607, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.08079168945550919, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1789.0, "completions/mean_length": 525.8671875, "completions/mean_terminated_length": 519.8980712890625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.21306277438998222, "epoch": 0.1819733697507682, "frac_reward_zero_std": 0.5, "grad_norm": 0.1196684092971572, "learning_rate": 9.818678661719642e-07, "loss": 0.0314, "num_tokens": 253390580.0, "reward": 1.1240234375, "reward_std": 0.1667175590991974, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1324.0, "completions/mean_length": 428.69140625, "completions/mean_terminated_length": 419.1473693847656, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.27190878987312317, "epoch": 0.18231478320245817, "frac_reward_zero_std": 0.59375, "grad_norm": 0.144234447598751, "learning_rate": 9.817168534475617e-07, "loss": 0.0435, "num_tokens": 253679558.0, "reward": 1.14013671875, "reward_std": 0.15589958429336548, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1436.0, "completions/mean_length": 437.185546875, "completions/mean_terminated_length": 434.03326416015625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.249112106859684, "epoch": 0.18265619665414817, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1080772090443539, "learning_rate": 9.81565227515056e-07, "loss": 0.0092, "num_tokens": 253978021.0, "reward": 1.1826171875, "reward_std": 0.14269787073135376, "rewards/accuracy_reward/mean": 0.19153225421905518, "rewards/accuracy_reward/std": 0.3939041793346405, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 491.591796875, "completions/mean_terminated_length": 441.4132995605469, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.2406299039721489, "epoch": 0.18299761010583818, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11538375087106344, "learning_rate": 9.814129885898154e-07, "loss": 0.0532, "num_tokens": 254307508.0, "reward": 1.04296875, "reward_std": 0.1101660206913948, "rewards/accuracy_reward/mean": 0.07056451588869095, "rewards/accuracy_reward/std": 0.25635457038879395, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.974609375, "rewards/tag_count_reward/std": 0.13212046027183533, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 479.302734375, "completions/mean_terminated_length": 466.9507751464844, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.21413543447852135, "epoch": 0.18333902355752818, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13361956383321316, "learning_rate": 9.812601368880796e-07, "loss": 0.0556, "num_tokens": 254624735.0, "reward": 1.07763671875, "reward_std": 0.12238923460245132, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99169921875, "rewards/tag_count_reward/std": 0.07372161000967026, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1599.0, "completions/mean_length": 447.255859375, "completions/mean_terminated_length": 444.123291015625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.21913380548357964, "epoch": 0.18368043700921816, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10096496946648675, "learning_rate": 9.811066726269582e-07, "loss": 0.04, "num_tokens": 254923778.0, "reward": 1.13330078125, "reward_std": 0.12248915433883667, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1443.0, "completions/mean_length": 448.220703125, "completions/mean_terminated_length": 435.6240234375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.25525109469890594, "epoch": 0.18402185046090816, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12415279688786747, "learning_rate": 9.809525960244308e-07, "loss": 0.0435, "num_tokens": 255229443.0, "reward": 1.0859375, "reward_std": 0.14568951725959778, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1660.0, "completions/mean_length": 455.5, "completions/mean_terminated_length": 449.2549133300781, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.25586118176579475, "epoch": 0.18436326391259816, "frac_reward_zero_std": 0.78125, "grad_norm": 0.09247710802551404, "learning_rate": 9.807979072993469e-07, "loss": 0.0202, "num_tokens": 255535635.0, "reward": 1.0654296875, "reward_std": 0.07089129090309143, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 466.65625, "completions/mean_terminated_length": 457.3359680175781, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.24643894284963608, "epoch": 0.18470467736428817, "frac_reward_zero_std": 0.5, "grad_norm": 0.1457263346113379, "learning_rate": 9.806426066714256e-07, "loss": 0.0331, "num_tokens": 255855027.0, "reward": 1.12353515625, "reward_std": 0.1518966555595398, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 408.0078125, "completions/mean_terminated_length": 398.34185791015625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.2810538187623024, "epoch": 0.18504609081597814, "frac_reward_zero_std": 0.78125, "grad_norm": 0.09652302972096004, "learning_rate": 9.804866943612547e-07, "loss": 0.0318, "num_tokens": 256147015.0, "reward": 1.16552734375, "reward_std": 0.07617424428462982, "rewards/accuracy_reward/mean": 0.169921875, "rewards/accuracy_reward/std": 0.3759314715862274, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 462.560546875, "completions/mean_terminated_length": 453.21612548828125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.2707936093211174, "epoch": 0.18538750426766815, "frac_reward_zero_std": 0.59375, "grad_norm": 0.126410072569272, "learning_rate": 9.803301705902917e-07, "loss": 0.0529, "num_tokens": 256465430.0, "reward": 1.09326171875, "reward_std": 0.13709528744220734, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1523.0, "completions/mean_length": 447.5078125, "completions/mean_terminated_length": 441.23138427734375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.22264325246214867, "epoch": 0.18572891771935815, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13480108868019294, "learning_rate": 9.801730355808616e-07, "loss": 0.0033, "num_tokens": 256766938.0, "reward": 1.2548828125, "reward_std": 0.23324057459831238, "rewards/accuracy_reward/mean": 0.2578125, "rewards/accuracy_reward/std": 0.43785804510116577, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1403.0, "completions/mean_length": 456.08984375, "completions/mean_terminated_length": 434.0237731933594, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.2573372945189476, "epoch": 0.18607033117104813, "frac_reward_zero_std": 0.40625, "grad_norm": 0.160145406069245, "learning_rate": 9.80015289556158e-07, "loss": 0.0793, "num_tokens": 257082888.0, "reward": 1.21337890625, "reward_std": 0.23930276930332184, "rewards/accuracy_reward/mean": 0.224609375, "rewards/accuracy_reward/std": 0.41773295402526855, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98876953125, "rewards/tag_count_reward/std": 0.08982396870851517, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1581.0, "completions/mean_length": 446.431640625, "completions/mean_terminated_length": 430.6370849609375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.23744409903883934, "epoch": 0.18641174462273813, "frac_reward_zero_std": 0.40625, "grad_norm": 0.14493262474168972, "learning_rate": 9.798569327402428e-07, "loss": 0.0659, "num_tokens": 257389573.0, "reward": 1.24658203125, "reward_std": 0.2167855203151703, "rewards/accuracy_reward/mean": 0.25390625, "rewards/accuracy_reward/std": 0.43567025661468506, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1523.0, "completions/mean_length": 482.19921875, "completions/mean_terminated_length": 472.9705505371094, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.2553015649318695, "epoch": 0.18675315807442813, "frac_reward_zero_std": 0.75, "grad_norm": 0.09018463916842347, "learning_rate": 9.79697965358045e-07, "loss": 0.0434, "num_tokens": 257713755.0, "reward": 1.14404296875, "reward_std": 0.10000023245811462, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 482.72265625, "completions/mean_terminated_length": 470.39764404296875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.24613632634282112, "epoch": 0.18709457152611814, "frac_reward_zero_std": 0.625, "grad_norm": 0.12694952134619408, "learning_rate": 9.795383876353606e-07, "loss": -0.0013, "num_tokens": 258046333.0, "reward": 1.14453125, "reward_std": 0.1477690190076828, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 495.5703125, "completions/mean_terminated_length": 480.2603759765625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.22378743812441826, "epoch": 0.1874359849778081, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1171198296706225, "learning_rate": 9.793781997988532e-07, "loss": 0.0673, "num_tokens": 258368785.0, "reward": 1.06640625, "reward_std": 0.13980433344841003, "rewards/accuracy_reward/mean": 0.07661290466785431, "rewards/accuracy_reward/std": 0.2662447690963745, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.07459938526153564, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1741.0, "completions/mean_length": 513.7734375, "completions/mean_terminated_length": 495.5810546875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.24012670665979385, "epoch": 0.18777739842949812, "frac_reward_zero_std": 0.5, "grad_norm": 0.13926996851498435, "learning_rate": 9.792174020760524e-07, "loss": 0.058, "num_tokens": 258706333.0, "reward": 1.15234375, "reward_std": 0.17002829909324646, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.07459938526153564, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1861.0, "completions/mean_length": 511.4609375, "completions/mean_terminated_length": 465.0865173339844, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.24468469247221947, "epoch": 0.18811881188118812, "frac_reward_zero_std": 0.5, "grad_norm": 0.13642121530996965, "learning_rate": 9.790559946953549e-07, "loss": 0.1412, "num_tokens": 259048985.0, "reward": 1.03857421875, "reward_std": 0.16668321192264557, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97802734375, "rewards/tag_count_reward/std": 0.12660174071788788, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 502.98828125, "completions/mean_terminated_length": 493.88214111328125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.23400313407182693, "epoch": 0.18846022533287812, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1160663071063458, "learning_rate": 9.788939778860224e-07, "loss": 0.0453, "num_tokens": 259376195.0, "reward": 1.12451171875, "reward_std": 0.1133580207824707, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 409.7578125, "completions/mean_terminated_length": 400.1021728515625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.2742925062775612, "epoch": 0.1888016387845681, "frac_reward_zero_std": 0.375, "grad_norm": 0.19957789899159514, "learning_rate": 9.787313518781823e-07, "loss": 0.0201, "num_tokens": 259662087.0, "reward": 1.23486328125, "reward_std": 0.22128403186798096, "rewards/accuracy_reward/mean": 0.240234375, "rewards/accuracy_reward/std": 0.4276435375213623, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.06134068965911865, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1660.0, "completions/mean_length": 460.33984375, "completions/mean_terminated_length": 447.8385925292969, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.23115480691194534, "epoch": 0.1891430522362581, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10605662531233141, "learning_rate": 9.785681169028283e-07, "loss": 0.032, "num_tokens": 259977205.0, "reward": 1.1240234375, "reward_std": 0.14107146859169006, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 496.3984375, "completions/mean_terminated_length": 439.89068603515625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.24203843995928764, "epoch": 0.1894844656879481, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12130840369017926, "learning_rate": 9.784042731918182e-07, "loss": 0.0632, "num_tokens": 260307633.0, "reward": 1.0888671875, "reward_std": 0.12510719895362854, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9716796875, "rewards/tag_count_reward/std": 0.14138229191303253, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1922.0, "completions/mean_length": 456.40625, "completions/mean_terminated_length": 450.16473388671875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.2583741620182991, "epoch": 0.1898258791396381, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10518332852615005, "learning_rate": 9.782398209778744e-07, "loss": 0.0178, "num_tokens": 260620593.0, "reward": 1.076171875, "reward_std": 0.11762133240699768, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.051725368946790695, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 453.720703125, "completions/mean_terminated_length": 444.3241882324219, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2521764673292637, "epoch": 0.19016729259132809, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14109999443315271, "learning_rate": 9.78074760494584e-07, "loss": 0.0459, "num_tokens": 260927234.0, "reward": 1.12255859375, "reward_std": 0.15029898285865784, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 539.880859375, "completions/mean_terminated_length": 528.0059204101562, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.21621771529316902, "epoch": 0.1905087060430181, "frac_reward_zero_std": 0.71875, "grad_norm": 0.0804897912981415, "learning_rate": 9.77909091976398e-07, "loss": 0.0396, "num_tokens": 261280357.0, "reward": 1.068359375, "reward_std": 0.10900794714689255, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1307.0, "completions/mean_length": 438.96875, "completions/mean_terminated_length": 429.48529052734375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2887233719229698, "epoch": 0.1908501194947081, "frac_reward_zero_std": 0.71875, "grad_norm": 0.12386324566328273, "learning_rate": 9.77742815658631e-07, "loss": 0.0473, "num_tokens": 261579541.0, "reward": 1.0302734375, "reward_std": 0.0874951183795929, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1939.0, "completions/mean_length": 454.501953125, "completions/mean_terminated_length": 435.60675048828125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.25598785653710365, "epoch": 0.1911915329463981, "frac_reward_zero_std": 0.5, "grad_norm": 0.15814033003985933, "learning_rate": 9.775759317774608e-07, "loss": 0.0738, "num_tokens": 261885670.0, "reward": 1.15625, "reward_std": 0.1779462993144989, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.07459938526153564, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 477.775390625, "completions/mean_terminated_length": 468.5206604003906, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.23695087060332298, "epoch": 0.19153294639808807, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12733444261949098, "learning_rate": 9.774084405699285e-07, "loss": 0.0451, "num_tokens": 262203347.0, "reward": 1.10302734375, "reward_std": 0.16535036265850067, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 462.814453125, "completions/mean_terminated_length": 456.5980529785156, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.23749813437461853, "epoch": 0.19187435984977808, "frac_reward_zero_std": 0.625, "grad_norm": 0.11278046019806277, "learning_rate": 9.772403422739374e-07, "loss": 0.0388, "num_tokens": 262522036.0, "reward": 1.1083984375, "reward_std": 0.14034666121006012, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1598.0, "completions/mean_length": 454.87890625, "completions/mean_terminated_length": 445.48919677734375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.2563861906528473, "epoch": 0.19221577330146808, "frac_reward_zero_std": 0.75, "grad_norm": 0.10654178542656982, "learning_rate": 9.770716371282538e-07, "loss": 0.0817, "num_tokens": 262830438.0, "reward": 1.068359375, "reward_std": 0.07894542813301086, "rewards/accuracy_reward/mean": 0.07661290466785431, "rewards/accuracy_reward/std": 0.2662447690963745, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 496.55078125, "completions/mean_terminated_length": 481.25048828125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.21017813682556152, "epoch": 0.19255718675315808, "frac_reward_zero_std": 0.5, "grad_norm": 0.12410877973017467, "learning_rate": 9.769023253725047e-07, "loss": 0.0623, "num_tokens": 263171840.0, "reward": 1.14697265625, "reward_std": 0.1563940793275833, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1767.0, "completions/mean_length": 482.18359375, "completions/mean_terminated_length": 469.8543395996094, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.23440641909837723, "epoch": 0.19289860020484806, "frac_reward_zero_std": 0.625, "grad_norm": 0.1020400663439176, "learning_rate": 9.767324072471803e-07, "loss": 0.0428, "num_tokens": 263493390.0, "reward": 1.13916015625, "reward_std": 0.13838869333267212, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1621.0, "completions/mean_length": 484.66015625, "completions/mean_terminated_length": 469.24261474609375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2366119809448719, "epoch": 0.19324001365653806, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13500720310250341, "learning_rate": 9.76561882993631e-07, "loss": 0.0563, "num_tokens": 263823872.0, "reward": 1.1005859375, "reward_std": 0.15378868579864502, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.08079168945550919, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 445.439453125, "completions/mean_terminated_length": 426.4466552734375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.23929695785045624, "epoch": 0.19358142710822807, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10472452429162611, "learning_rate": 9.763907528540684e-07, "loss": 0.0256, "num_tokens": 264136337.0, "reward": 1.095703125, "reward_std": 0.09830731153488159, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1042603924870491, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1716.0, "completions/mean_length": 446.904296875, "completions/mean_terminated_length": 443.7710266113281, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.24554229900240898, "epoch": 0.19392284055991807, "frac_reward_zero_std": 0.625, "grad_norm": 0.1297425571166084, "learning_rate": 9.762190170715649e-07, "loss": 0.0057, "num_tokens": 264438448.0, "reward": 1.14697265625, "reward_std": 0.1395510733127594, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 521.78515625, "completions/mean_terminated_length": 509.7677001953125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.2176627516746521, "epoch": 0.19426425401160805, "frac_reward_zero_std": 0.75, "grad_norm": 0.08131079135976302, "learning_rate": 9.760466758900526e-07, "loss": 0.0384, "num_tokens": 264778514.0, "reward": 1.083984375, "reward_std": 0.08680853992700577, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1954.0, "completions/mean_length": 457.912109375, "completions/mean_terminated_length": 451.6764831542969, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.24595008417963982, "epoch": 0.19460566746329805, "frac_reward_zero_std": 0.75, "grad_norm": 0.08602090687018682, "learning_rate": 9.758737295543246e-07, "loss": 0.0357, "num_tokens": 265086261.0, "reward": 1.1201171875, "reward_std": 0.0955268144607544, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 446.263671875, "completions/mean_terminated_length": 443.129150390625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.2722974792122841, "epoch": 0.19494708091498805, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10657223807225448, "learning_rate": 9.757001783100323e-07, "loss": 0.0377, "num_tokens": 265399308.0, "reward": 1.08056640625, "reward_std": 0.09335482120513916, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 407.484375, "completions/mean_terminated_length": 404.2739562988281, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.25222451239824295, "epoch": 0.19528849436667806, "frac_reward_zero_std": 0.6875, "grad_norm": 0.13409361655028804, "learning_rate": 9.755260224036872e-07, "loss": 0.02, "num_tokens": 265684564.0, "reward": 1.1494140625, "reward_std": 0.11982448399066925, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1488.0, "completions/mean_length": 480.912109375, "completions/mean_terminated_length": 477.84539794921875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.2236756682395935, "epoch": 0.19562990781836803, "frac_reward_zero_std": 0.8125, "grad_norm": 0.062251186578712044, "learning_rate": 9.753512620826592e-07, "loss": 0.0141, "num_tokens": 266007319.0, "reward": 1.03564453125, "reward_std": 0.06991732120513916, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1548.0, "completions/mean_length": 489.193359375, "completions/mean_terminated_length": 480.00592041015625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2460326999425888, "epoch": 0.19597132127005804, "frac_reward_zero_std": 0.65625, "grad_norm": 0.14269820159300897, "learning_rate": 9.751758975951767e-07, "loss": 0.0214, "num_tokens": 266334618.0, "reward": 1.0615234375, "reward_std": 0.11679846793413162, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 471.640625, "completions/mean_terminated_length": 459.22833251953125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.22191151976585388, "epoch": 0.19631273472174804, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1190723002657753, "learning_rate": 9.749999291903267e-07, "loss": 0.0551, "num_tokens": 266650178.0, "reward": 1.19921875, "reward_std": 0.16547030210494995, "rewards/accuracy_reward/mean": 0.205078125, "rewards/accuracy_reward/std": 0.4041535556316376, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 508.87890625, "completions/mean_terminated_length": 496.75982666015625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.21814818307757378, "epoch": 0.19665414817343804, "frac_reward_zero_std": 0.625, "grad_norm": 0.11585627538804008, "learning_rate": 9.748233571180536e-07, "loss": 0.0705, "num_tokens": 266995684.0, "reward": 1.0654296875, "reward_std": 0.12427891045808792, "rewards/accuracy_reward/mean": 0.07258064299821854, "rewards/accuracy_reward/std": 0.25970885157585144, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1645.0, "completions/mean_length": 467.359375, "completions/mean_terminated_length": 461.1607971191406, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.24227293580770493, "epoch": 0.19699556162512802, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10527957481900319, "learning_rate": 9.74646181629159e-07, "loss": 0.029, "num_tokens": 267314764.0, "reward": 1.1064453125, "reward_std": 0.11979028582572937, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1605.0, "completions/mean_length": 464.04296875, "completions/mean_terminated_length": 457.8313903808594, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.23903603479266167, "epoch": 0.19733697507681802, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1133663430515894, "learning_rate": 9.744684029753026e-07, "loss": 0.0046, "num_tokens": 267628786.0, "reward": 1.1669921875, "reward_std": 0.14730705320835114, "rewards/accuracy_reward/mean": 0.17540322244167328, "rewards/accuracy_reward/std": 0.3806955814361572, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 471.162109375, "completions/mean_terminated_length": 461.8683776855469, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.25821832567453384, "epoch": 0.19767838852850803, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13064974947448488, "learning_rate": 9.742900214089994e-07, "loss": 0.038, "num_tokens": 267949397.0, "reward": 1.10205078125, "reward_std": 0.12206517159938812, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1621.0, "completions/mean_length": 489.2890625, "completions/mean_terminated_length": 486.2387390136719, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2194036804139614, "epoch": 0.19801980198019803, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10384999144885851, "learning_rate": 9.741110371836224e-07, "loss": -0.0074, "num_tokens": 268278889.0, "reward": 1.12451171875, "reward_std": 0.1476087123155594, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1875.0, "completions/mean_length": 474.857421875, "completions/mean_terminated_length": 468.6882629394531, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.227583397179842, "epoch": 0.198361215431888, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10647958427250592, "learning_rate": 9.739314505533989e-07, "loss": 0.0305, "num_tokens": 268603888.0, "reward": 1.09033203125, "reward_std": 0.09790096431970596, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2041.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 444.658203125, "completions/mean_terminated_length": 444.658203125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.24521810188889503, "epoch": 0.198702628883578, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1142131511348983, "learning_rate": 9.73751261773413e-07, "loss": -0.0252, "num_tokens": 268917617.0, "reward": 1.111328125, "reward_std": 0.12405378371477127, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1554.0, "completions/mean_length": 471.794921875, "completions/mean_terminated_length": 453.1047668457031, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.22149859741330147, "epoch": 0.199044042335268, "frac_reward_zero_std": 0.5, "grad_norm": 0.14624084379827698, "learning_rate": 9.735704710996043e-07, "loss": 0.1017, "num_tokens": 269230568.0, "reward": 1.10595703125, "reward_std": 0.16887986660003662, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99072265625, "rewards/tag_count_reward/std": 0.0814909115433693, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1437.0, "completions/mean_length": 470.908203125, "completions/mean_terminated_length": 461.61297607421875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.2519678771495819, "epoch": 0.19938545578695802, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11233456379739339, "learning_rate": 9.73389078788766e-07, "loss": 0.0349, "num_tokens": 269553625.0, "reward": 1.09130859375, "reward_std": 0.12195862829685211, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1363.0, "completions/mean_length": 384.193359375, "completions/mean_terminated_length": 371.092529296875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.2583496943116188, "epoch": 0.199726869238648, "frac_reward_zero_std": 0.46875, "grad_norm": 0.19081434692512136, "learning_rate": 9.732070850985472e-07, "loss": 0.061, "num_tokens": 269819644.0, "reward": 1.13134765625, "reward_std": 0.17565369606018066, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.0704338550567627, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 466.419921875, "completions/mean_terminated_length": 463.3248596191406, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.22463556751608849, "epoch": 0.200068282690338, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10092455839043303, "learning_rate": 9.730244902874507e-07, "loss": -0.022, "num_tokens": 270136435.0, "reward": 1.09423828125, "reward_std": 0.1100165992975235, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 492.541015625, "completions/mean_terminated_length": 480.2933044433594, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.20338797941803932, "epoch": 0.200409696142028, "frac_reward_zero_std": 0.65625, "grad_norm": 0.0895706089846684, "learning_rate": 9.728412946148327e-07, "loss": 0.0978, "num_tokens": 270470856.0, "reward": 1.0517578125, "reward_std": 0.10992145538330078, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.08079168945550919, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1238.0, "completions/max_terminated_length": 1238.0, "completions/mean_length": 401.314453125, "completions/mean_terminated_length": 401.314453125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.2525404766201973, "epoch": 0.200751109593718, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12612823372310122, "learning_rate": 9.726574983409039e-07, "loss": -0.0165, "num_tokens": 270752057.0, "reward": 1.107421875, "reward_std": 0.13416439294815063, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1247.0, "completions/mean_length": 434.3984375, "completions/mean_terminated_length": 428.07061767578125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.2411617822945118, "epoch": 0.20109252304540798, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12189499389932182, "learning_rate": 9.724731017267267e-07, "loss": 0.0189, "num_tokens": 271052517.0, "reward": 1.1767578125, "reward_std": 0.17046988010406494, "rewards/accuracy_reward/mean": 0.1854838728904724, "rewards/accuracy_reward/std": 0.38908204436302185, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1094.0, "completions/max_terminated_length": 1094.0, "completions/mean_length": 370.689453125, "completions/mean_terminated_length": 370.689453125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.25820572301745415, "epoch": 0.20143393649709798, "frac_reward_zero_std": 0.65625, "grad_norm": 0.142136366607818, "learning_rate": 9.722881050342175e-07, "loss": -0.0141, "num_tokens": 271309782.0, "reward": 1.193359375, "reward_std": 0.1417127251625061, "rewards/accuracy_reward/mean": 0.193359375, "rewards/accuracy_reward/std": 0.39531853795051575, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1271.0, "completions/max_terminated_length": 1271.0, "completions/mean_length": 427.345703125, "completions/mean_terminated_length": 427.345703125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2473033107817173, "epoch": 0.20177534994878799, "frac_reward_zero_std": 0.78125, "grad_norm": 0.0973527391763799, "learning_rate": 9.721025085261442e-07, "loss": 0.0067, "num_tokens": 271610247.0, "reward": 1.134765625, "reward_std": 0.07885697484016418, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1871.0, "completions/mean_length": 419.998046875, "completions/mean_terminated_length": 407.17913818359375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.2456602193415165, "epoch": 0.202116763400478, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12171250335079639, "learning_rate": 9.719163124661276e-07, "loss": 0.0491, "num_tokens": 271898966.0, "reward": 1.126953125, "reward_std": 0.16393029689788818, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1733.0, "completions/mean_length": 442.833984375, "completions/mean_terminated_length": 433.373291015625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.241958636790514, "epoch": 0.20245817685216796, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11086882995481945, "learning_rate": 9.717295171186388e-07, "loss": 0.0534, "num_tokens": 272207441.0, "reward": 1.08544921875, "reward_std": 0.11210784316062927, "rewards/accuracy_reward/mean": 0.0927419364452362, "rewards/accuracy_reward/std": 0.2903633117675781, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 437.240234375, "completions/mean_terminated_length": 427.74658203125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.1926196999847889, "epoch": 0.20279959030385797, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12786572304964824, "learning_rate": 9.71542122749001e-07, "loss": -0.0044, "num_tokens": 272501900.0, "reward": 1.23388671875, "reward_std": 0.14807216823101044, "rewards/accuracy_reward/mean": 0.23828125, "rewards/accuracy_reward/std": 0.42644867300987244, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1806.0, "completions/mean_length": 459.119140625, "completions/mean_terminated_length": 456.0097961425781, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.22723442688584328, "epoch": 0.20314100375554797, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13664503726348115, "learning_rate": 9.713541296233884e-07, "loss": -0.005, "num_tokens": 272811785.0, "reward": 1.14111328125, "reward_std": 0.18652145564556122, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1621.0, "completions/mean_length": 461.35546875, "completions/mean_terminated_length": 458.25048828125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.2370263896882534, "epoch": 0.20348241720723798, "frac_reward_zero_std": 0.625, "grad_norm": 0.11982693352111308, "learning_rate": 9.711655380088249e-07, "loss": 0.0093, "num_tokens": 273113951.0, "reward": 1.1845703125, "reward_std": 0.15087538957595825, "rewards/accuracy_reward/mean": 0.19153225421905518, "rewards/accuracy_reward/std": 0.3939041793346405, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1487.0, "completions/mean_length": 507.388671875, "completions/mean_terminated_length": 498.3084716796875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.20945081859827042, "epoch": 0.20382383065892795, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12126893111385552, "learning_rate": 9.709763481731853e-07, "loss": 0.0192, "num_tokens": 273445638.0, "reward": 1.14697265625, "reward_std": 0.1515788733959198, "rewards/accuracy_reward/mean": 0.1572580635547638, "rewards/accuracy_reward/std": 0.36441144347190857, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.06134068965911865, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1638.0, "completions/mean_length": 466.0234375, "completions/mean_terminated_length": 462.9275817871094, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.228700153529644, "epoch": 0.20416524411061795, "frac_reward_zero_std": 0.8125, "grad_norm": 0.08226214661802791, "learning_rate": 9.707865603851936e-07, "loss": 0.0088, "num_tokens": 273766674.0, "reward": 1.17822265625, "reward_std": 0.0771043598651886, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.38430243730545044, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1937.0, "completions/max_terminated_length": 1937.0, "completions/mean_length": 437.802734375, "completions/mean_terminated_length": 437.802734375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.22349407523870468, "epoch": 0.20450665756230796, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12850403979062677, "learning_rate": 9.70596174914423e-07, "loss": 0.0213, "num_tokens": 274076461.0, "reward": 1.13720703125, "reward_std": 0.11489619314670563, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.024685947224497795, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 453.9765625, "completions/mean_terminated_length": 444.58154296875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.22712574526667595, "epoch": 0.20484807101399796, "frac_reward_zero_std": 0.5625, "grad_norm": 0.15281832987423247, "learning_rate": 9.704051920312964e-07, "loss": 0.0434, "num_tokens": 274388865.0, "reward": 1.15966796875, "reward_std": 0.15538254380226135, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1835.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 476.33984375, "completions/mean_terminated_length": 476.33984375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2230435237288475, "epoch": 0.20518948446568794, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12979024531498495, "learning_rate": 9.702136120070845e-07, "loss": -0.0369, "num_tokens": 274708879.0, "reward": 1.197265625, "reward_std": 0.17114542424678802, "rewards/accuracy_reward/mean": 0.197265625, "rewards/accuracy_reward/std": 0.3983237147331238, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1510.0, "completions/max_terminated_length": 1510.0, "completions/mean_length": 432.51953125, "completions/mean_terminated_length": 432.51953125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.2693590670824051, "epoch": 0.20553089791737794, "frac_reward_zero_std": 0.875, "grad_norm": 0.06762586662585685, "learning_rate": 9.700214351139064e-07, "loss": -0.0033, "num_tokens": 275009257.0, "reward": 1.09765625, "reward_std": 0.036034777760505676, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 481.8984375, "completions/mean_terminated_length": 475.75689697265625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.22585934773087502, "epoch": 0.20587231136906795, "frac_reward_zero_std": 0.53125, "grad_norm": 0.129230009326872, "learning_rate": 9.69828661624729e-07, "loss": 0.0009, "num_tokens": 275330837.0, "reward": 1.17333984375, "reward_std": 0.17536255717277527, "rewards/accuracy_reward/mean": 0.177734375, "rewards/accuracy_reward/std": 0.3826628625392914, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05285605043172836, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 519.8046875, "completions/mean_terminated_length": 513.811767578125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.22098911926150322, "epoch": 0.20621372482075795, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11566817864101049, "learning_rate": 9.696352918133672e-07, "loss": -0.0124, "num_tokens": 275675953.0, "reward": 1.1220703125, "reward_std": 0.13083884119987488, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.056179627776145935, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1889.0, "completions/mean_length": 549.53515625, "completions/mean_terminated_length": 525.75, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.18511221557855606, "epoch": 0.20655513827244792, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09574435632600722, "learning_rate": 9.694413259544815e-07, "loss": 0.0537, "num_tokens": 276030435.0, "reward": 1.2138671875, "reward_std": 0.13473767042160034, "rewards/accuracy_reward/mean": 0.224609375, "rewards/accuracy_reward/std": 0.41773295402526855, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9892578125, "rewards/tag_count_reward/std": 0.08781895041465759, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1443.0, "completions/mean_length": 478.205078125, "completions/mean_terminated_length": 475.133056640625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.21272239461541176, "epoch": 0.20689655172413793, "frac_reward_zero_std": 0.625, "grad_norm": 0.11995041476653696, "learning_rate": 9.692467643235805e-07, "loss": -0.007, "num_tokens": 276353052.0, "reward": 1.08642578125, "reward_std": 0.13712327182292938, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1637.0, "completions/mean_length": 513.26953125, "completions/mean_terminated_length": 510.2661437988281, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.20607471093535423, "epoch": 0.20723796517582793, "frac_reward_zero_std": 0.75, "grad_norm": 0.09018971164573115, "learning_rate": 9.690516071970182e-07, "loss": 0.0202, "num_tokens": 276690070.0, "reward": 1.1015625, "reward_std": 0.08356846868991852, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1462.0, "completions/mean_length": 497.841796875, "completions/mean_terminated_length": 491.7627868652344, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.22948677092790604, "epoch": 0.20757937862751794, "frac_reward_zero_std": 0.84375, "grad_norm": 0.08430619105229949, "learning_rate": 9.688558548519946e-07, "loss": 0.0038, "num_tokens": 277024645.0, "reward": 1.0947265625, "reward_std": 0.06470514088869095, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 466.814453125, "completions/mean_terminated_length": 441.71630859375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.23839197680354118, "epoch": 0.2079207920792079, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12407025667725478, "learning_rate": 9.686595075665552e-07, "loss": 0.0578, "num_tokens": 277340406.0, "reward": 1.099609375, "reward_std": 0.15175989270210266, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.986328125, "rewards/tag_count_reward/std": 0.09796657413244247, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 562.14453125, "completions/mean_terminated_length": 541.5584106445312, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2024935819208622, "epoch": 0.20826220553089791, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10759815885143152, "learning_rate": 9.684625656195908e-07, "loss": 0.0071, "num_tokens": 277715312.0, "reward": 1.1513671875, "reward_std": 0.18473950028419495, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9794921875, "rewards/tag_count_reward/std": 0.10639684647321701, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1388.0, "completions/max_terminated_length": 1388.0, "completions/mean_length": 504.958984375, "completions/mean_terminated_length": 504.958984375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.2572329118847847, "epoch": 0.20860361898258792, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12186884845863437, "learning_rate": 9.682650292908362e-07, "loss": -0.0035, "num_tokens": 278052971.0, "reward": 1.138671875, "reward_std": 0.15257751941680908, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1937.0, "completions/mean_length": 521.25390625, "completions/mean_terminated_length": 515.2667236328125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.20919125527143478, "epoch": 0.20894503243427792, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09045699448029616, "learning_rate": 9.680668988608708e-07, "loss": 0.0009, "num_tokens": 278404045.0, "reward": 1.0478515625, "reward_std": 0.09721650183200836, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.041276250034570694, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1610.0, "completions/mean_length": 522.79296875, "completions/mean_terminated_length": 516.811767578125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.20484806969761848, "epoch": 0.2092864458859679, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10043708750170836, "learning_rate": 9.678681746111186e-07, "loss": 0.0281, "num_tokens": 278743443.0, "reward": 1.1416015625, "reward_std": 0.13971181213855743, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 511.599609375, "completions/mean_terminated_length": 499.501953125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.2290508858859539, "epoch": 0.2096278593376579, "frac_reward_zero_std": 0.625, "grad_norm": 0.11080699951538556, "learning_rate": 9.676688568238456e-07, "loss": 0.0232, "num_tokens": 279083510.0, "reward": 1.14453125, "reward_std": 0.14132508635520935, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 534.455078125, "completions/mean_terminated_length": 488.80078125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.2416735664010048, "epoch": 0.2099692727893479, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13419059986597012, "learning_rate": 9.67468945782162e-07, "loss": 0.0302, "num_tokens": 279438559.0, "reward": 1.130859375, "reward_std": 0.16049370169639587, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.974609375, "rewards/tag_count_reward/std": 0.1311914473772049, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 464.65234375, "completions/mean_terminated_length": 452.1850280761719, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2595118582248688, "epoch": 0.2103106862410379, "frac_reward_zero_std": 0.625, "grad_norm": 0.13603342868135831, "learning_rate": 9.672684417700203e-07, "loss": 0.0604, "num_tokens": 279760285.0, "reward": 1.08984375, "reward_std": 0.12750549614429474, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 509.0703125, "completions/mean_terminated_length": 506.0587158203125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2379816733300686, "epoch": 0.21065209969272788, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12460435837344809, "learning_rate": 9.67067345072215e-07, "loss": 0.0015, "num_tokens": 280103793.0, "reward": 1.15087890625, "reward_std": 0.18938972055912018, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 526.845703125, "completions/mean_terminated_length": 511.84417724609375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.22317499294877052, "epoch": 0.2109935131444179, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11488450944097273, "learning_rate": 9.668656559743827e-07, "loss": 0.0475, "num_tokens": 280442674.0, "reward": 1.20654296875, "reward_std": 0.14573580026626587, "rewards/accuracy_reward/mean": 0.212890625, "rewards/accuracy_reward/std": 0.409751296043396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.06697065383195877, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 502.15625, "completions/mean_terminated_length": 486.9112548828125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.22746043652296066, "epoch": 0.2113349265961079, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11965078995170976, "learning_rate": 9.666633747630017e-07, "loss": 0.0578, "num_tokens": 280778146.0, "reward": 1.11181640625, "reward_std": 0.10846738517284393, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1485.0, "completions/max_terminated_length": 1485.0, "completions/mean_length": 458.4765625, "completions/mean_terminated_length": 458.4765625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.24563685059547424, "epoch": 0.2116763400477979, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13621238704714822, "learning_rate": 9.66460501725391e-07, "loss": -0.0153, "num_tokens": 281090358.0, "reward": 1.1015625, "reward_std": 0.12780256569385529, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1585.0, "completions/mean_length": 507.966796875, "completions/mean_terminated_length": 501.927490234375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.22758925706148148, "epoch": 0.21201775349948787, "frac_reward_zero_std": 0.5, "grad_norm": 0.14668703934783764, "learning_rate": 9.662570371497098e-07, "loss": 0.0291, "num_tokens": 281428309.0, "reward": 1.1708984375, "reward_std": 0.19847439229488373, "rewards/accuracy_reward/mean": 0.173828125, "rewards/accuracy_reward/std": 0.3793322443962097, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1460.0, "completions/mean_length": 457.087890625, "completions/mean_terminated_length": 450.84906005859375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.25817833095788956, "epoch": 0.21235916695117787, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1455153438789572, "learning_rate": 9.660529813249586e-07, "loss": 0.0435, "num_tokens": 281740482.0, "reward": 1.1884765625, "reward_std": 0.18171073496341705, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.3937928080558777, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1826.0, "completions/mean_length": 431.193359375, "completions/mean_terminated_length": 428.02935791015625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.25527654588222504, "epoch": 0.21270058040286788, "frac_reward_zero_std": 0.53125, "grad_norm": 0.147313966897961, "learning_rate": 9.65848334540977e-07, "loss": 0.0133, "num_tokens": 282035013.0, "reward": 1.22900390625, "reward_std": 0.20427241921424866, "rewards/accuracy_reward/mean": 0.23046875, "rewards/accuracy_reward/std": 0.42154473066329956, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1887.0, "completions/mean_length": 508.841796875, "completions/mean_terminated_length": 465.5943603515625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.22672826796770096, "epoch": 0.21304199385455788, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12464427124197641, "learning_rate": 9.656430970884437e-07, "loss": 0.0521, "num_tokens": 282384964.0, "reward": 1.15087890625, "reward_std": 0.16392357647418976, "rewards/accuracy_reward/mean": 0.17943547666072845, "rewards/accuracy_reward/std": 0.3841039538383484, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97705078125, "rewards/tag_count_reward/std": 0.11946428567171097, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 573.447265625, "completions/mean_terminated_length": 525.9092407226562, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.22465117648243904, "epoch": 0.21338340730624786, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1254837510539888, "learning_rate": 9.65437269258877e-07, "loss": 0.0188, "num_tokens": 282757289.0, "reward": 1.1083984375, "reward_std": 0.16031306982040405, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9755859375, "rewards/tag_count_reward/std": 0.12950171530246735, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 473.130859375, "completions/mean_terminated_length": 466.9549255371094, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.224904116243124, "epoch": 0.21372482075793786, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11074023122516247, "learning_rate": 9.652308513446339e-07, "loss": 0.0422, "num_tokens": 283079724.0, "reward": 1.1396484375, "reward_std": 0.13638056814670563, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1744.0, "completions/mean_length": 499.859375, "completions/mean_terminated_length": 496.8297424316406, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.237799234688282, "epoch": 0.21406623420962786, "frac_reward_zero_std": 0.59375, "grad_norm": 0.2181870555743113, "learning_rate": 9.650238436389088e-07, "loss": 0.0121, "num_tokens": 283414212.0, "reward": 1.14697265625, "reward_std": 0.16173049807548523, "rewards/accuracy_reward/mean": 0.15322580933570862, "rewards/accuracy_reward/std": 0.36056873202323914, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1951.0, "completions/mean_length": 471.337890625, "completions/mean_terminated_length": 462.0451965332031, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2175065614283085, "epoch": 0.21440764766131787, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12108840325163041, "learning_rate": 9.648162464357344e-07, "loss": 0.0275, "num_tokens": 283735153.0, "reward": 1.1083984375, "reward_std": 0.1341843605041504, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1651.0, "completions/mean_length": 456.427734375, "completions/mean_terminated_length": 443.8956604003906, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2656674310564995, "epoch": 0.21474906111300784, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1403677615956696, "learning_rate": 9.646080600299802e-07, "loss": 0.0644, "num_tokens": 284051932.0, "reward": 1.0625, "reward_std": 0.13740524649620056, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1950.0, "completions/mean_length": 465.193359375, "completions/mean_terminated_length": 452.7303161621094, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.23089142888784409, "epoch": 0.21509047456469785, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12543564926543688, "learning_rate": 9.643992847173535e-07, "loss": 0.0631, "num_tokens": 284363071.0, "reward": 1.1328125, "reward_std": 0.15989968180656433, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 523.412109375, "completions/mean_terminated_length": 511.407470703125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2309400960803032, "epoch": 0.21543188801638785, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13113563690084676, "learning_rate": 9.641899207943971e-07, "loss": 0.0516, "num_tokens": 284705474.0, "reward": 1.126953125, "reward_std": 0.16995996236801147, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1660.0, "completions/mean_length": 498.169921875, "completions/mean_terminated_length": 495.1369934082031, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.21041805669665337, "epoch": 0.21577330146807785, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12459207324040632, "learning_rate": 9.639799685584907e-07, "loss": -0.0143, "num_tokens": 285037737.0, "reward": 1.26220703125, "reward_std": 0.212809756398201, "rewards/accuracy_reward/mean": 0.263671875, "rewards/accuracy_reward/std": 0.4410543739795685, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1401.0, "completions/mean_length": 485.443359375, "completions/mean_terminated_length": 482.3855285644531, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.21669244393706322, "epoch": 0.21611471491976783, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11744646087219215, "learning_rate": 9.63769428307849e-07, "loss": 0.0144, "num_tokens": 285359788.0, "reward": 1.14501953125, "reward_std": 0.12440601736307144, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1762.0, "completions/max_terminated_length": 1762.0, "completions/mean_length": 451.533203125, "completions/mean_terminated_length": 451.533203125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.2264413982629776, "epoch": 0.21645612837145783, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11236683133382878, "learning_rate": 9.63558300341522e-07, "loss": -0.0094, "num_tokens": 285663837.0, "reward": 1.1494140625, "reward_std": 0.1564304679632187, "rewards/accuracy_reward/mean": 0.1552419364452362, "rewards/accuracy_reward/std": 0.36250078678131104, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 514.041015625, "completions/mean_terminated_length": 495.851806640625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.24984141439199448, "epoch": 0.21679754182314784, "frac_reward_zero_std": 0.625, "grad_norm": 0.12646382427209624, "learning_rate": 9.63346584959395e-07, "loss": 0.0588, "num_tokens": 286006306.0, "reward": 1.0498046875, "reward_std": 0.12866827845573425, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.08079168945550919, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 492.59375, "completions/mean_terminated_length": 483.42633056640625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.23177236318588257, "epoch": 0.21713895527483784, "frac_reward_zero_std": 0.75, "grad_norm": 0.08332152117746659, "learning_rate": 9.63134282462187e-07, "loss": 0.0384, "num_tokens": 286334578.0, "reward": 1.11083984375, "reward_std": 0.08990012109279633, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1876.0, "completions/mean_length": 513.9609375, "completions/mean_terminated_length": 507.94512939453125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.2018139287829399, "epoch": 0.21748036872652782, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12374733862199454, "learning_rate": 9.629213931514513e-07, "loss": 0.0324, "num_tokens": 286677518.0, "reward": 1.0947265625, "reward_std": 0.11598433554172516, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1868.0, "completions/mean_length": 475.439453125, "completions/mean_terminated_length": 472.3620300292969, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2085774689912796, "epoch": 0.21782178217821782, "frac_reward_zero_std": 0.625, "grad_norm": 0.11845405228464201, "learning_rate": 9.627079173295747e-07, "loss": 0.0097, "num_tokens": 286989439.0, "reward": 1.22314453125, "reward_std": 0.153598815202713, "rewards/accuracy_reward/mean": 0.23185484111309052, "rewards/accuracy_reward/std": 0.42244285345077515, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 461.32421875, "completions/mean_terminated_length": 455.10198974609375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.21493497118353844, "epoch": 0.21816319562990782, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14095172212122795, "learning_rate": 9.62493855299777e-07, "loss": 0.0002, "num_tokens": 287298837.0, "reward": 1.18017578125, "reward_std": 0.15117445588111877, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.3875311613082886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1343.0, "completions/mean_length": 459.91796875, "completions/mean_terminated_length": 456.8101806640625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.23139140754938126, "epoch": 0.21850460908159783, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1396145913175417, "learning_rate": 9.622792073661107e-07, "loss": 0.0152, "num_tokens": 287610091.0, "reward": 1.15087890625, "reward_std": 0.1380678117275238, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1944.0, "completions/mean_length": 512.98046875, "completions/mean_terminated_length": 482.40240478515625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.2151733674108982, "epoch": 0.2188460225332878, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1531465304478703, "learning_rate": 9.620639738334602e-07, "loss": 0.0451, "num_tokens": 287947377.0, "reward": 1.1064453125, "reward_std": 0.15691068768501282, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9853515625, "rewards/tag_count_reward/std": 0.10388854891061783, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1447.0, "completions/mean_length": 408.931640625, "completions/mean_terminated_length": 402.5039367675781, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.26489177346229553, "epoch": 0.2191874359849778, "frac_reward_zero_std": 0.625, "grad_norm": 0.15024439823429478, "learning_rate": 9.618481550075423e-07, "loss": 0.015, "num_tokens": 288234318.0, "reward": 1.10205078125, "reward_std": 0.13239756226539612, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 501.79296875, "completions/mean_terminated_length": 495.72943115234375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.21808556839823723, "epoch": 0.2195288494366678, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11586885434287658, "learning_rate": 9.616317511949047e-07, "loss": 0.0127, "num_tokens": 288570964.0, "reward": 1.11962890625, "reward_std": 0.1328050047159195, "rewards/accuracy_reward/mean": 0.1270161271095276, "rewards/accuracy_reward/std": 0.3333272337913513, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 473.541015625, "completions/mean_terminated_length": 470.4598693847656, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.22123106196522713, "epoch": 0.21987026288835781, "frac_reward_zero_std": 0.59375, "grad_norm": 0.131463885201285, "learning_rate": 9.61414762702926e-07, "loss": 0.0022, "num_tokens": 288887417.0, "reward": 1.11767578125, "reward_std": 0.14571963250637054, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1538.0, "completions/mean_length": 463.115234375, "completions/mean_terminated_length": 456.9000244140625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.22643610462546349, "epoch": 0.2202116763400478, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11732567319980645, "learning_rate": 9.611971898398155e-07, "loss": 0.0033, "num_tokens": 289199412.0, "reward": 1.1318359375, "reward_std": 0.1312960684299469, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 471.7578125, "completions/mean_terminated_length": 468.6731872558594, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2379244528710842, "epoch": 0.2205530897917378, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1323781960335547, "learning_rate": 9.609790329146124e-07, "loss": 0.0149, "num_tokens": 289529816.0, "reward": 1.06884765625, "reward_std": 0.12111242115497589, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 487.4453125, "completions/mean_terminated_length": 478.24755859375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2122977375984192, "epoch": 0.2208945032434278, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12287412720742653, "learning_rate": 9.607602922371855e-07, "loss": 0.0432, "num_tokens": 289860348.0, "reward": 1.1220703125, "reward_std": 0.12076111882925034, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1394.0, "completions/mean_length": 448.66015625, "completions/mean_terminated_length": 442.3882751464844, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.22040829807519913, "epoch": 0.2212359166951178, "frac_reward_zero_std": 0.625, "grad_norm": 0.11515226436405408, "learning_rate": 9.605409681182328e-07, "loss": 0.0259, "num_tokens": 290161998.0, "reward": 1.1552734375, "reward_std": 0.13131645321846008, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 459.529296875, "completions/mean_terminated_length": 443.8639221191406, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.21773913502693176, "epoch": 0.22157733014680778, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12291347192105102, "learning_rate": 9.60321060869281e-07, "loss": 0.0596, "num_tokens": 290470925.0, "reward": 1.08251953125, "reward_std": 0.1601206660270691, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 480.40625, "completions/mean_terminated_length": 468.06298828125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.23543063178658485, "epoch": 0.22191874359849778, "frac_reward_zero_std": 0.625, "grad_norm": 0.1347584602396258, "learning_rate": 9.601005708026851e-07, "loss": 0.02, "num_tokens": 290797629.0, "reward": 1.166015625, "reward_std": 0.1479671597480774, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 476.107421875, "completions/mean_terminated_length": 473.03131103515625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.2080899514257908, "epoch": 0.22226015705018778, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12317039599769666, "learning_rate": 9.59879498231628e-07, "loss": 0.0047, "num_tokens": 291114596.0, "reward": 1.13525390625, "reward_std": 0.1665613055229187, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 449.236328125, "completions/mean_terminated_length": 442.9667053222656, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.23883434385061264, "epoch": 0.22260157050187776, "frac_reward_zero_std": 0.625, "grad_norm": 0.12441561388025757, "learning_rate": 9.596578434701198e-07, "loss": 0.0392, "num_tokens": 291424941.0, "reward": 1.107421875, "reward_std": 0.12218627333641052, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.051725368946790695, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1531.0, "completions/mean_length": 478.099609375, "completions/mean_terminated_length": 471.94317626953125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.2284850738942623, "epoch": 0.22294298395356776, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14641693215102, "learning_rate": 9.594356068329975e-07, "loss": 0.0118, "num_tokens": 291749280.0, "reward": 1.1552734375, "reward_std": 0.17882013320922852, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1933.0, "completions/mean_length": 487.443359375, "completions/mean_terminated_length": 478.24560546875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.21671683713793755, "epoch": 0.22328439740525777, "frac_reward_zero_std": 0.625, "grad_norm": 0.12337530707044499, "learning_rate": 9.592127886359247e-07, "loss": 0.0187, "num_tokens": 292080451.0, "reward": 1.115234375, "reward_std": 0.14482833445072174, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04930410906672478, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1442.0, "completions/mean_length": 444.333984375, "completions/mean_terminated_length": 441.1956787109375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.2399872988462448, "epoch": 0.22362581085694777, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10021004191893101, "learning_rate": 9.589893891953914e-07, "loss": 0.0248, "num_tokens": 292380526.0, "reward": 1.12451171875, "reward_std": 0.11293382197618484, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.039800092577934265, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1719.0, "completions/max_terminated_length": 1719.0, "completions/mean_length": 449.876953125, "completions/mean_terminated_length": 449.876953125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.2199433073401451, "epoch": 0.22396722430863775, "frac_reward_zero_std": 0.625, "grad_norm": 0.136421203529227, "learning_rate": 9.587654088287128e-07, "loss": -0.0273, "num_tokens": 292686527.0, "reward": 1.2158203125, "reward_std": 0.12913379073143005, "rewards/accuracy_reward/mean": 0.216796875, "rewards/accuracy_reward/std": 0.4124660789966583, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1893.0, "completions/max_terminated_length": 1893.0, "completions/mean_length": 452.009765625, "completions/mean_terminated_length": 452.009765625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.21856653690338135, "epoch": 0.22430863776032775, "frac_reward_zero_std": 0.75, "grad_norm": 0.08755692807631915, "learning_rate": 9.585408478540289e-07, "loss": -0.0003, "num_tokens": 292990356.0, "reward": 1.138671875, "reward_std": 0.10200159251689911, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1597.0, "completions/mean_length": 489.46875, "completions/mean_terminated_length": 474.0986328125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.2171977162361145, "epoch": 0.22465005121201775, "frac_reward_zero_std": 0.5, "grad_norm": 0.14949384310507258, "learning_rate": 9.58315706590305e-07, "loss": 0.0485, "num_tokens": 293309972.0, "reward": 1.15478515625, "reward_std": 0.20721587538719177, "rewards/accuracy_reward/mean": 0.16733871400356293, "rewards/accuracy_reward/std": 0.37365487217903137, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 519.650390625, "completions/mean_terminated_length": 516.6594848632812, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.21068145707249641, "epoch": 0.22499146466370776, "frac_reward_zero_std": 0.625, "grad_norm": 0.11797871480190339, "learning_rate": 9.580899853573308e-07, "loss": 0.0052, "num_tokens": 293657697.0, "reward": 1.11279296875, "reward_std": 0.13419198989868164, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.039800092577934265, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1436.0, "completions/max_terminated_length": 1436.0, "completions/mean_length": 497.318359375, "completions/mean_terminated_length": 497.318359375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.19767539203166962, "epoch": 0.22533287811539773, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13152810559726885, "learning_rate": 9.57863684475719e-07, "loss": -0.0162, "num_tokens": 293985972.0, "reward": 1.1708984375, "reward_std": 0.18690776824951172, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1689.0, "completions/mean_length": 488.451171875, "completions/mean_terminated_length": 473.0710144042969, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.21647707000374794, "epoch": 0.22567429156708774, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13607174660855104, "learning_rate": 9.576368042669063e-07, "loss": 0.0827, "num_tokens": 294314123.0, "reward": 1.14306640625, "reward_std": 0.1687343418598175, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1690.0, "completions/mean_length": 526.0, "completions/mean_terminated_length": 514.0157470703125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.2102285735309124, "epoch": 0.22601570501877774, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11286606841433773, "learning_rate": 9.574093450531518e-07, "loss": 0.0248, "num_tokens": 294661083.0, "reward": 1.1787109375, "reward_std": 0.1663917899131775, "rewards/accuracy_reward/mean": 0.185546875, "rewards/accuracy_reward/std": 0.38912075757980347, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06960996240377426, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 558.50390625, "completions/mean_terminated_length": 543.8145751953125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.20838592574000359, "epoch": 0.22635711847046774, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12035196909048174, "learning_rate": 9.571813071575375e-07, "loss": 0.029, "num_tokens": 295031565.0, "reward": 1.09619140625, "reward_std": 0.16539505124092102, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 561.142578125, "completions/mean_terminated_length": 555.311767578125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.1794872134923935, "epoch": 0.22669853192215772, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12533598976402885, "learning_rate": 9.56952690903967e-07, "loss": 0.0349, "num_tokens": 295391542.0, "reward": 1.1552734375, "reward_std": 0.1530238389968872, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 510.4140625, "completions/mean_terminated_length": 498.3070983886719, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.210487499833107, "epoch": 0.22703994537384772, "frac_reward_zero_std": 0.625, "grad_norm": 0.114524400732344, "learning_rate": 9.567234966171651e-07, "loss": 0.058, "num_tokens": 295734170.0, "reward": 1.056640625, "reward_std": 0.1323169469833374, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1523.0, "completions/mean_length": 482.103515625, "completions/mean_terminated_length": 479.03912353515625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.20480836555361748, "epoch": 0.22738135882553773, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1236050023113201, "learning_rate": 9.564937246226787e-07, "loss": 0.0126, "num_tokens": 296060143.0, "reward": 1.13525390625, "reward_std": 0.15051233768463135, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 499.140625, "completions/mean_terminated_length": 490.0118103027344, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2160727046430111, "epoch": 0.22772277227722773, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11452636887516321, "learning_rate": 9.562633752468744e-07, "loss": 0.0551, "num_tokens": 296394535.0, "reward": 1.12060546875, "reward_std": 0.11772677302360535, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1677.0, "completions/mean_length": 480.125, "completions/mean_terminated_length": 461.53363037109375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.20529597252607346, "epoch": 0.2280641857289177, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1511540651400859, "learning_rate": 9.560324488169387e-07, "loss": 0.0783, "num_tokens": 296713671.0, "reward": 1.1630859375, "reward_std": 0.15080787241458893, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.08079168945550919, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1647.0, "completions/mean_length": 474.224609375, "completions/mean_terminated_length": 458.70416259765625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.21772955358028412, "epoch": 0.2284055991806077, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13035718873953583, "learning_rate": 9.558009456608786e-07, "loss": 0.0709, "num_tokens": 297034154.0, "reward": 1.12060546875, "reward_std": 0.1564001888036728, "rewards/accuracy_reward/mean": 0.13104838132858276, "rewards/accuracy_reward/std": 0.3377939462661743, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.06697065383195877, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 492.38671875, "completions/mean_terminated_length": 470.82379150390625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.20365739986300468, "epoch": 0.2287470126322977, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12757135889089613, "learning_rate": 9.555688661075198e-07, "loss": 0.05, "num_tokens": 297368112.0, "reward": 1.20263671875, "reward_std": 0.18388621509075165, "rewards/accuracy_reward/mean": 0.212890625, "rewards/accuracy_reward/std": 0.409751296043396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98974609375, "rewards/tag_count_reward/std": 0.08717872947454453, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 423.18359375, "completions/mean_terminated_length": 413.6070861816406, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2044973187148571, "epoch": 0.22908842608398772, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14302073187156586, "learning_rate": 9.553362104865063e-07, "loss": 0.0329, "num_tokens": 297652782.0, "reward": 1.20849609375, "reward_std": 0.16160425543785095, "rewards/accuracy_reward/mean": 0.212890625, "rewards/accuracy_reward/std": 0.409751296043396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2047.0, "completions/max_terminated_length": 1451.0, "completions/mean_length": 495.853515625, "completions/mean_terminated_length": 489.7706298828125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.22349877282977104, "epoch": 0.2294298395356777, "frac_reward_zero_std": 0.71875, "grad_norm": 0.1108532275933992, "learning_rate": 9.551029791283014e-07, "loss": -0.0148, "num_tokens": 297984531.0, "reward": 1.060546875, "reward_std": 0.10590162128210068, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 527.833984375, "completions/mean_terminated_length": 497.5637512207031, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.2148323766887188, "epoch": 0.2297712529873677, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11696454423992962, "learning_rate": 9.548691723641851e-07, "loss": 0.0418, "num_tokens": 298332606.0, "reward": 1.1044921875, "reward_std": 0.1537870466709137, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9755859375, "rewards/tag_count_reward/std": 0.12070263922214508, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1643.0, "completions/max_terminated_length": 1643.0, "completions/mean_length": 464.24609375, "completions/mean_terminated_length": 464.24609375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2217281088232994, "epoch": 0.2301126664390577, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11828636821775462, "learning_rate": 9.546347905262556e-07, "loss": 0.004, "num_tokens": 298647132.0, "reward": 1.123046875, "reward_std": 0.10709279775619507, "rewards/accuracy_reward/mean": 0.1270161271095276, "rewards/accuracy_reward/std": 0.33332720398902893, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1735.0, "completions/mean_length": 552.185546875, "completions/mean_terminated_length": 528.4425048828125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.1954418644309044, "epoch": 0.2304540798907477, "frac_reward_zero_std": 0.5, "grad_norm": 0.13091042623593727, "learning_rate": 9.543998339474272e-07, "loss": 0.0349, "num_tokens": 299010587.0, "reward": 1.13330078125, "reward_std": 0.186738058924675, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98876953125, "rewards/tag_count_reward/std": 0.08982396870851517, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 594.015625, "completions/mean_terminated_length": 541.06884765625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.20305418968200684, "epoch": 0.23079549334243768, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11327639165079363, "learning_rate": 9.541643029614309e-07, "loss": 0.0303, "num_tokens": 299393811.0, "reward": 1.18115234375, "reward_std": 0.16186685860157013, "rewards/accuracy_reward/mean": 0.20703125, "rewards/accuracy_reward/std": 0.40557438135147095, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97412109375, "rewards/tag_count_reward/std": 0.13613051176071167, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1427.0, "completions/mean_length": 509.451171875, "completions/mean_terminated_length": 500.38311767578125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2219552919268608, "epoch": 0.23113690679412768, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12121598790973787, "learning_rate": 9.539281979028132e-07, "loss": 0.0393, "num_tokens": 299738154.0, "reward": 1.09326171875, "reward_std": 0.13532710075378418, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 508.365234375, "completions/mean_terminated_length": 502.3274841308594, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.21280357241630554, "epoch": 0.2314783202458177, "frac_reward_zero_std": 0.65625, "grad_norm": 0.16014469743625423, "learning_rate": 9.53691519106937e-07, "loss": 0.0229, "num_tokens": 300070085.0, "reward": 1.1552734375, "reward_std": 0.13031834363937378, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 481.009765625, "completions/mean_terminated_length": 477.9432373046875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.19989868625998497, "epoch": 0.2318197336975077, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10464648043150053, "learning_rate": 9.534542669099792e-07, "loss": 0.0121, "num_tokens": 300395114.0, "reward": 1.12451171875, "reward_std": 0.12360771745443344, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 460.283203125, "completions/mean_terminated_length": 454.056884765625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.22103272378444672, "epoch": 0.23216114714919767, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10398868489094414, "learning_rate": 9.532164416489314e-07, "loss": 0.0433, "num_tokens": 300716171.0, "reward": 1.1630859375, "reward_std": 0.09683827310800552, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1422.0, "completions/mean_length": 459.09765625, "completions/mean_terminated_length": 452.86669921875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.2170395702123642, "epoch": 0.23250256060088767, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12377807648261563, "learning_rate": 9.529780436615992e-07, "loss": 0.0232, "num_tokens": 301025117.0, "reward": 1.1689453125, "reward_std": 0.13440465927124023, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1304.0, "completions/max_terminated_length": 1304.0, "completions/mean_length": 462.7109375, "completions/mean_terminated_length": 462.7109375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.22110578790307045, "epoch": 0.23284397405257767, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12410168450543665, "learning_rate": 9.527390732866016e-07, "loss": -0.0137, "num_tokens": 301337817.0, "reward": 1.1484375, "reward_std": 0.1340911090373993, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 570.80078125, "completions/mean_terminated_length": 529.295166015625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2049558088183403, "epoch": 0.23318538750426768, "frac_reward_zero_std": 0.625, "grad_norm": 0.10375858978764621, "learning_rate": 9.52499530863371e-07, "loss": 0.0279, "num_tokens": 301706691.0, "reward": 1.13525390625, "reward_std": 0.136324942111969, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97509765625, "rewards/tag_count_reward/std": 0.1289350837469101, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 495.634765625, "completions/mean_terminated_length": 492.59686279296875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.19719870388507843, "epoch": 0.23352680095595765, "frac_reward_zero_std": 0.5625, "grad_norm": 0.119068511594675, "learning_rate": 9.522594167321519e-07, "loss": 0.0112, "num_tokens": 302034680.0, "reward": 1.15283203125, "reward_std": 0.16416628658771515, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1513.0, "completions/max_terminated_length": 1513.0, "completions/mean_length": 524.109375, "completions/mean_terminated_length": 524.109375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.20824290066957474, "epoch": 0.23386821440764766, "frac_reward_zero_std": 0.625, "grad_norm": 0.10314924659888285, "learning_rate": 9.520187312340011e-07, "loss": -0.0106, "num_tokens": 302382144.0, "reward": 1.169921875, "reward_std": 0.1450715959072113, "rewards/accuracy_reward/mean": 0.169921875, "rewards/accuracy_reward/std": 0.3759314715862274, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1555.0, "completions/mean_length": 496.95703125, "completions/mean_terminated_length": 493.9217224121094, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.21254917979240417, "epoch": 0.23420962785933766, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11960930156751369, "learning_rate": 9.517774747107868e-07, "loss": 0.004, "num_tokens": 302712842.0, "reward": 1.18408203125, "reward_std": 0.1679394543170929, "rewards/accuracy_reward/mean": 0.185546875, "rewards/accuracy_reward/std": 0.38912075757980347, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1630.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 500.6328125, "completions/mean_terminated_length": 500.6328125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.19473891332745552, "epoch": 0.23455104131102766, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14221885687601146, "learning_rate": 9.515356475051884e-07, "loss": -0.0091, "num_tokens": 303048542.0, "reward": 1.140625, "reward_std": 0.1648963987827301, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1660.0, "completions/mean_length": 555.744140625, "completions/mean_terminated_length": 507.6370849609375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.24014944210648537, "epoch": 0.23489245476271764, "frac_reward_zero_std": 0.625, "grad_norm": 0.11988018822161635, "learning_rate": 9.512932499606957e-07, "loss": -0.012, "num_tokens": 303421787.0, "reward": 1.07958984375, "reward_std": 0.13283556699752808, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97607421875, "rewards/tag_count_reward/std": 0.13006402552127838, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1772.0, "completions/mean_length": 489.341796875, "completions/mean_terminated_length": 486.2915954589844, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.22404289618134499, "epoch": 0.23523386821440764, "frac_reward_zero_std": 0.78125, "grad_norm": 0.09955370001267191, "learning_rate": 9.510502824216091e-07, "loss": 0.0047, "num_tokens": 303752314.0, "reward": 1.05322265625, "reward_std": 0.07821530848741531, "rewards/accuracy_reward/mean": 0.05645161122083664, "rewards/accuracy_reward/std": 0.23102475702762604, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1190.0, "completions/mean_length": 442.8515625, "completions/mean_terminated_length": 439.7103576660156, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.2448342852294445, "epoch": 0.23557528166609765, "frac_reward_zero_std": 0.625, "grad_norm": 0.1403544864158117, "learning_rate": 9.50806745233038e-07, "loss": 0.0255, "num_tokens": 304058366.0, "reward": 1.13330078125, "reward_std": 0.13734932243824005, "rewards/accuracy_reward/mean": 0.1391129046678543, "rewards/accuracy_reward/std": 0.34641367197036743, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1787.0, "completions/mean_length": 520.041015625, "completions/mean_terminated_length": 511.0353698730469, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.22016974166035652, "epoch": 0.23591669511778765, "frac_reward_zero_std": 0.5, "grad_norm": 0.18949716550529563, "learning_rate": 9.505626387409013e-07, "loss": 0.015, "num_tokens": 304401219.0, "reward": 1.14599609375, "reward_std": 0.20119234919548035, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2047.0, "completions/max_terminated_length": 1950.0, "completions/mean_length": 507.859375, "completions/mean_terminated_length": 470.9200134277344, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2259349450469017, "epoch": 0.23625810856947763, "frac_reward_zero_std": 0.625, "grad_norm": 0.12544376992407155, "learning_rate": 9.503179632919265e-07, "loss": 0.0034, "num_tokens": 304742043.0, "reward": 1.095703125, "reward_std": 0.13141858577728271, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.978515625, "rewards/tag_count_reward/std": 0.11922509223222733, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2005.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 531.650390625, "completions/mean_terminated_length": 531.650390625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.2132948413491249, "epoch": 0.23659952202116763, "frac_reward_zero_std": 0.5, "grad_norm": 0.137598886170009, "learning_rate": 9.500727192336488e-07, "loss": 0.0058, "num_tokens": 305083928.0, "reward": 1.2099609375, "reward_std": 0.18419361114501953, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4083731174468994, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1813.0, "completions/mean_length": 586.90234375, "completions/mean_terminated_length": 584.0430297851562, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.16534538194537163, "epoch": 0.23694093547285763, "frac_reward_zero_std": 0.625, "grad_norm": 0.09486928576032724, "learning_rate": 9.49826906914412e-07, "loss": 0.0104, "num_tokens": 305456934.0, "reward": 1.13623046875, "reward_std": 0.13961058855056763, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.039800092577934265, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1742.0, "completions/mean_length": 483.314453125, "completions/mean_terminated_length": 464.7608947753906, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.21232197806239128, "epoch": 0.23728234892454764, "frac_reward_zero_std": 0.5, "grad_norm": 0.153852590450085, "learning_rate": 9.495805266833661e-07, "loss": 0.0463, "num_tokens": 305777991.0, "reward": 1.21875, "reward_std": 0.1703641265630722, "rewards/accuracy_reward/mean": 0.228515625, "rewards/accuracy_reward/std": 0.4202871024608612, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.08365631848573685, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1720.0, "completions/mean_length": 482.6015625, "completions/mean_terminated_length": 473.3752746582031, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.21240396052598953, "epoch": 0.2376237623762376, "frac_reward_zero_std": 0.40625, "grad_norm": 0.16879412776053593, "learning_rate": 9.493335788904683e-07, "loss": 0.0306, "num_tokens": 306104523.0, "reward": 1.21630859375, "reward_std": 0.24464884400367737, "rewards/accuracy_reward/mean": 0.220703125, "rewards/accuracy_reward/std": 0.4151262938976288, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 525.689453125, "completions/mean_terminated_length": 513.7027587890625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.2043604589998722, "epoch": 0.23796517582792762, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1124876619959216, "learning_rate": 9.490860638864818e-07, "loss": 0.0458, "num_tokens": 306449836.0, "reward": 1.13671875, "reward_std": 0.11315509676933289, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1717.0, "completions/mean_length": 546.904296875, "completions/mean_terminated_length": 529.104736328125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.19008737802505493, "epoch": 0.23830658927961762, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11238422228350262, "learning_rate": 9.488379820229755e-07, "loss": 0.0948, "num_tokens": 306804235.0, "reward": 1.1015625, "reward_std": 0.1374732106924057, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.08365631848573685, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1762.0, "completions/mean_length": 547.0546875, "completions/mean_terminated_length": 544.117431640625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.20867148414254189, "epoch": 0.23864800273130762, "frac_reward_zero_std": 0.625, "grad_norm": 0.1304119484876727, "learning_rate": 9.485893336523233e-07, "loss": 0.0139, "num_tokens": 307156487.0, "reward": 1.13525390625, "reward_std": 0.14529694616794586, "rewards/accuracy_reward/mean": 0.1411290317773819, "rewards/accuracy_reward/std": 0.3485061228275299, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 474.24609375, "completions/mean_terminated_length": 464.9705505371094, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.21508710458874702, "epoch": 0.2389894161829976, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12700590808592305, "learning_rate": 9.483401191277038e-07, "loss": 0.041, "num_tokens": 307469941.0, "reward": 1.1708984375, "reward_std": 0.1847928762435913, "rewards/accuracy_reward/mean": 0.17578125, "rewards/accuracy_reward/std": 0.3810062110424042, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1677.0, "completions/mean_length": 522.5859375, "completions/mean_terminated_length": 510.5747985839844, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.20845911279320717, "epoch": 0.2393308296346876, "frac_reward_zero_std": 0.5, "grad_norm": 0.1381535904610966, "learning_rate": 9.480903388031002e-07, "loss": 0.0609, "num_tokens": 307816529.0, "reward": 1.18603515625, "reward_std": 0.2000790387392044, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.3937928080558777, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.059313252568244934, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1539.0, "completions/mean_length": 501.5234375, "completions/mean_terminated_length": 495.4588623046875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.22851786389946938, "epoch": 0.2396722430863776, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1193626170477473, "learning_rate": 9.478399930332987e-07, "loss": 0.0099, "num_tokens": 308156221.0, "reward": 1.1318359375, "reward_std": 0.14061586558818817, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1905.0, "completions/mean_length": 563.447265625, "completions/mean_terminated_length": 551.7578735351562, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.18719759210944176, "epoch": 0.2400136565380676, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10208441762390126, "learning_rate": 9.475890821738894e-07, "loss": 0.0506, "num_tokens": 308513442.0, "reward": 1.099609375, "reward_std": 0.11833953857421875, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 575.251953125, "completions/mean_terminated_length": 572.369873046875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2078980654478073, "epoch": 0.24035506998975759, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11409655173288619, "learning_rate": 9.47337606581264e-07, "loss": -0.0109, "num_tokens": 308883651.0, "reward": 1.1416015625, "reward_std": 0.16354680061340332, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 563.65234375, "completions/mean_terminated_length": 543.0811767578125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.17728624865412712, "epoch": 0.2406964834414476, "frac_reward_zero_std": 0.4375, "grad_norm": 0.16255043042413642, "learning_rate": 9.470855666126176e-07, "loss": 0.0569, "num_tokens": 309246881.0, "reward": 1.28173828125, "reward_std": 0.20222993195056915, "rewards/accuracy_reward/mean": 0.3145161271095276, "rewards/accuracy_reward/std": 0.4647916853427887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97705078125, "rewards/tag_count_reward/std": 0.11423057317733765, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 546.091796875, "completions/mean_terminated_length": 528.2826538085938, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.20216145738959312, "epoch": 0.2410378968931376, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10321195285634178, "learning_rate": 9.468329626259459e-07, "loss": 0.038, "num_tokens": 309607360.0, "reward": 1.21875, "reward_std": 0.11559019237756729, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.4190165400505066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.07459938526153564, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2002.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 562.25390625, "completions/mean_terminated_length": 562.25390625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.20201657339930534, "epoch": 0.2413793103448276, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1215021937073887, "learning_rate": 9.465797949800462e-07, "loss": 0.0079, "num_tokens": 309977570.0, "reward": 1.146484375, "reward_std": 0.1539159119129181, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 572.154296875, "completions/mean_terminated_length": 557.599609375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.22550709173083305, "epoch": 0.24172072379651757, "frac_reward_zero_std": 0.625, "grad_norm": 0.1245338451402539, "learning_rate": 9.463260640345164e-07, "loss": 0.0274, "num_tokens": 310351697.0, "reward": 1.07861328125, "reward_std": 0.13866841793060303, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1942.0, "completions/mean_length": 584.919921875, "completions/mean_terminated_length": 576.2966918945312, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.22151029855012894, "epoch": 0.24206213724820758, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12432257714216542, "learning_rate": 9.460717701497546e-07, "loss": -0.0125, "num_tokens": 310720408.0, "reward": 1.1162109375, "reward_std": 0.17278194427490234, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 540.697265625, "completions/mean_terminated_length": 534.7863159179688, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.2125404141843319, "epoch": 0.24240355069989758, "frac_reward_zero_std": 0.65625, "grad_norm": 0.14341324334045533, "learning_rate": 9.458169136869581e-07, "loss": 0.0437, "num_tokens": 311068429.0, "reward": 1.1494140625, "reward_std": 0.11553068459033966, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 652.318359375, "completions/mean_terminated_length": 613.0823364257812, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.16786815598607063, "epoch": 0.24274496415158758, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12330305730790725, "learning_rate": 9.455614950081236e-07, "loss": 0.0347, "num_tokens": 311478992.0, "reward": 1.2294921875, "reward_std": 0.16523955762386322, "rewards/accuracy_reward/mean": 0.25806450843811035, "rewards/accuracy_reward/std": 0.43801143765449524, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9794921875, "rewards/tag_count_reward/std": 0.12243188172578812, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 570.48828125, "completions/mean_terminated_length": 561.7799682617188, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.2152835950255394, "epoch": 0.24308637760327756, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12517457167178417, "learning_rate": 9.453055144760462e-07, "loss": 0.0081, "num_tokens": 311846634.0, "reward": 1.205078125, "reward_std": 0.17599722743034363, "rewards/accuracy_reward/mean": 0.208984375, "rewards/accuracy_reward/std": 0.40698084235191345, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.051725368946790695, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 585.140625, "completions/mean_terminated_length": 561.920654296875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.20879288017749786, "epoch": 0.24342779105496756, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12295323824251671, "learning_rate": 9.450489724543195e-07, "loss": 0.0344, "num_tokens": 312220690.0, "reward": 1.07421875, "reward_std": 0.1318282186985016, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.09310565888881683, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1707.0, "completions/mean_length": 587.8359375, "completions/mean_terminated_length": 570.5217895507812, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.2350471019744873, "epoch": 0.24376920450665757, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13180643612047327, "learning_rate": 9.447918693073339e-07, "loss": 0.0422, "num_tokens": 312598718.0, "reward": 1.1025390625, "reward_std": 0.1273098886013031, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.08079168945550919, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1838.0, "completions/mean_length": 589.98828125, "completions/mean_terminated_length": 572.6996459960938, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.21184594556689262, "epoch": 0.24411061795834757, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10869993715648923, "learning_rate": 9.445342054002775e-07, "loss": 0.0399, "num_tokens": 312971256.0, "reward": 1.1220703125, "reward_std": 0.1330697238445282, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.07770495861768723, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 605.74609375, "completions/mean_terminated_length": 597.24560546875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.20561859756708145, "epoch": 0.24445203141003755, "frac_reward_zero_std": 0.625, "grad_norm": 0.11022232657483554, "learning_rate": 9.442759810991345e-07, "loss": 0.0235, "num_tokens": 313359462.0, "reward": 1.05615234375, "reward_std": 0.1293564736843109, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24230584502220154, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1948.0, "completions/mean_length": 612.3984375, "completions/mean_terminated_length": 603.9371337890625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.19868680089712143, "epoch": 0.24479344486172755, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1039612028315537, "learning_rate": 9.440171967706852e-07, "loss": 0.022, "num_tokens": 313746850.0, "reward": 1.15380859375, "reward_std": 0.15985330939292908, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 575.123046875, "completions/mean_terminated_length": 569.3471069335938, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.21504658088088036, "epoch": 0.24513485831341755, "frac_reward_zero_std": 0.5, "grad_norm": 0.12747844473377068, "learning_rate": 9.437578527825055e-07, "loss": 0.005, "num_tokens": 314116081.0, "reward": 1.2021484375, "reward_std": 0.1898803412914276, "rewards/accuracy_reward/mean": 0.205078125, "rewards/accuracy_reward/std": 0.4041535556316376, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 643.279296875, "completions/mean_terminated_length": 609.5660400390625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2005002647638321, "epoch": 0.24547627176510756, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12036169788944916, "learning_rate": 9.434979495029658e-07, "loss": 0.1048, "num_tokens": 314522144.0, "reward": 1.1220703125, "reward_std": 0.15352237224578857, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9833984375, "rewards/tag_count_reward/std": 0.10933786630630493, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1860.0, "completions/mean_length": 600.3671875, "completions/mean_terminated_length": 591.8350219726562, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.18230075761675835, "epoch": 0.24581768521679753, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10838130268715031, "learning_rate": 9.432374873012313e-07, "loss": 0.0337, "num_tokens": 314899484.0, "reward": 1.1318359375, "reward_std": 0.1263992190361023, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1819.0, "completions/mean_length": 634.052734375, "completions/mean_terminated_length": 628.5078735351562, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.19637395441532135, "epoch": 0.24615909866848754, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11683207824939286, "learning_rate": 9.429764665472612e-07, "loss": 0.0015, "num_tokens": 315294807.0, "reward": 1.2294921875, "reward_std": 0.2169683575630188, "rewards/accuracy_reward/mean": 0.232421875, "rewards/accuracy_reward/std": 0.42278963327407837, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 591.431640625, "completions/mean_terminated_length": 588.5812377929688, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.2236880585551262, "epoch": 0.24650051212017754, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09798633394184418, "learning_rate": 9.427148876118077e-07, "loss": -0.0131, "num_tokens": 315664612.0, "reward": 1.07470703125, "reward_std": 0.10791425406932831, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 611.408203125, "completions/mean_terminated_length": 602.9411010742188, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.21874397993087769, "epoch": 0.24684192557186754, "frac_reward_zero_std": 0.71875, "grad_norm": 0.12217899313342948, "learning_rate": 9.424527508664157e-07, "loss": 0.0393, "num_tokens": 316051413.0, "reward": 1.10107421875, "reward_std": 0.10465440154075623, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 660.67578125, "completions/mean_terminated_length": 655.2353515625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.19907107576727867, "epoch": 0.24718333902355752, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12659553936321555, "learning_rate": 9.421900566834233e-07, "loss": -0.0035, "num_tokens": 316471103.0, "reward": 1.1630859375, "reward_std": 0.2078220546245575, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1731.0, "completions/mean_length": 612.48828125, "completions/mean_terminated_length": 598.3313598632812, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.20461948961019516, "epoch": 0.24752475247524752, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09352734630559963, "learning_rate": 9.419268054359594e-07, "loss": 0.019, "num_tokens": 316860713.0, "reward": 1.13330078125, "reward_std": 0.14307375252246857, "rewards/accuracy_reward/mean": 0.14516128599643707, "rewards/accuracy_reward/std": 0.3526190221309662, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 624.490234375, "completions/mean_terminated_length": 601.8948974609375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.20234686136245728, "epoch": 0.24786616592693753, "frac_reward_zero_std": 0.625, "grad_norm": 0.10412920751231565, "learning_rate": 9.416629974979448e-07, "loss": 0.031, "num_tokens": 317262804.0, "reward": 1.08984375, "reward_std": 0.14604413509368896, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.09310565888881683, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2047.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 590.537109375, "completions/mean_terminated_length": 573.266845703125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.21518559753894806, "epoch": 0.24820757937862753, "frac_reward_zero_std": 0.84375, "grad_norm": 0.08100254861550638, "learning_rate": 9.413986332440903e-07, "loss": 0.0071, "num_tokens": 317647351.0, "reward": 1.01904296875, "reward_std": 0.05151323974132538, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98193359375, "rewards/tag_count_reward/std": 0.10275448858737946, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 598.3046875, "completions/mean_terminated_length": 584.0078735351562, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.22069146111607552, "epoch": 0.2485489928303175, "frac_reward_zero_std": 0.625, "grad_norm": 0.10784128924508857, "learning_rate": 9.411337130498977e-07, "loss": 0.031, "num_tokens": 318030851.0, "reward": 1.07666015625, "reward_std": 0.13983051478862762, "rewards/accuracy_reward/mean": 0.08669354766607285, "rewards/accuracy_reward/std": 0.281669557094574, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1841.0, "completions/mean_length": 669.775390625, "completions/mean_terminated_length": 661.6522827148438, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.18457714840769768, "epoch": 0.2488904062820075, "frac_reward_zero_std": 0.75, "grad_norm": 0.07521897101221905, "learning_rate": 9.408682372916582e-07, "loss": 0.0207, "num_tokens": 318449296.0, "reward": 1.18701171875, "reward_std": 0.08057050406932831, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.3937928080558777, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 613.4609375, "completions/mean_terminated_length": 607.8353271484375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.21098411828279495, "epoch": 0.2492318197336975, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10913485643685432, "learning_rate": 9.406022063464516e-07, "loss": 0.0282, "num_tokens": 318835372.0, "reward": 1.2158203125, "reward_std": 0.15248551964759827, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41380295157432556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1621.0, "completions/mean_length": 640.26171875, "completions/mean_terminated_length": 623.5692138671875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.2002451978623867, "epoch": 0.24957323318538752, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09116764794212656, "learning_rate": 9.403356205921471e-07, "loss": 0.0525, "num_tokens": 319241922.0, "reward": 1.1162109375, "reward_std": 0.13734324276447296, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.08079168945550919, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1950.0, "completions/mean_length": 603.986328125, "completions/mean_terminated_length": 583.9703369140625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.19930466264486313, "epoch": 0.2499146466370775, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11842572971096328, "learning_rate": 9.400684804074015e-07, "loss": 0.0869, "num_tokens": 319623259.0, "reward": 1.11474609375, "reward_std": 0.13092736899852753, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98974609375, "rewards/tag_count_reward/std": 0.08717872947454453, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1861.0, "completions/mean_length": 652.06640625, "completions/mean_terminated_length": 624.2589721679688, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.20755723491311073, "epoch": 0.2502560600887675, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1369890960969456, "learning_rate": 9.398007861716589e-07, "loss": -0.0008, "num_tokens": 320042461.0, "reward": 1.11962890625, "reward_std": 0.19919615983963013, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98486328125, "rewards/tag_count_reward/std": 0.10440578311681747, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 625.611328125, "completions/mean_terminated_length": 617.2279052734375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.20290705934166908, "epoch": 0.25059747354045747, "frac_reward_zero_std": 0.78125, "grad_norm": 0.09619488731197232, "learning_rate": 9.39532538265151e-07, "loss": 0.0349, "num_tokens": 320444742.0, "reward": 1.0556640625, "reward_std": 0.06698599457740784, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1815.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 585.181640625, "completions/mean_terminated_length": 585.181640625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.19780207052826881, "epoch": 0.2509388869921475, "frac_reward_zero_std": 0.625, "grad_norm": 0.11036461907948214, "learning_rate": 9.392637370688951e-07, "loss": -0.0042, "num_tokens": 320822483.0, "reward": 1.166015625, "reward_std": 0.14947746694087982, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 622.296875, "completions/mean_terminated_length": 608.2366943359375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.18526850640773773, "epoch": 0.2512803004438375, "frac_reward_zero_std": 0.625, "grad_norm": 0.1015315294410914, "learning_rate": 9.389943829646953e-07, "loss": 0.0343, "num_tokens": 321219227.0, "reward": 1.12939453125, "reward_std": 0.1405368596315384, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 564.46875, "completions/mean_terminated_length": 552.7874145507812, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.21139715984463692, "epoch": 0.2516217138955275, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1240660116519022, "learning_rate": 9.387244763351403e-07, "loss": 0.0424, "num_tokens": 321578283.0, "reward": 1.205078125, "reward_std": 0.12911957502365112, "rewards/accuracy_reward/mean": 0.2177419364452362, "rewards/accuracy_reward/std": 0.41312772035598755, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 628.0, "completions/mean_terminated_length": 611.162109375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.18864763528108597, "epoch": 0.2519631273472175, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10608271206404393, "learning_rate": 9.384540175636042e-07, "loss": 0.0585, "num_tokens": 321972763.0, "reward": 1.1572265625, "reward_std": 0.13876962661743164, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.08079168945550919, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1871.0, "completions/mean_length": 574.662109375, "completions/mean_terminated_length": 563.06103515625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 0.21298912912607193, "epoch": 0.25230454079890746, "frac_reward_zero_std": 0.5, "grad_norm": 0.14486083699927355, "learning_rate": 9.381830070342446e-07, "loss": 0.0211, "num_tokens": 322342030.0, "reward": 1.27001953125, "reward_std": 0.20171679556369781, "rewards/accuracy_reward/mean": 0.275390625, "rewards/accuracy_reward/std": 0.44714778661727905, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.06134068965911865, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 584.673828125, "completions/mean_terminated_length": 573.1515502929688, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.20591432228684425, "epoch": 0.2526459542505975, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12001970773156569, "learning_rate": 9.379114451320039e-07, "loss": 0.0292, "num_tokens": 322718839.0, "reward": 1.2138671875, "reward_std": 0.169985830783844, "rewards/accuracy_reward/mean": 0.220703125, "rewards/accuracy_reward/std": 0.4151262938976288, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06960996240377426, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 627.734375, "completions/mean_terminated_length": 619.3634643554688, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.20296473801136017, "epoch": 0.25298736770228747, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1249283502261158, "learning_rate": 9.376393322426065e-07, "loss": 0.0192, "num_tokens": 323115631.0, "reward": 1.17919921875, "reward_std": 0.1594557762145996, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.3875311613082886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 613.05078125, "completions/mean_terminated_length": 593.160400390625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.21802276372909546, "epoch": 0.25332878115397744, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13522019251535886, "learning_rate": 9.373666687525603e-07, "loss": 0.0818, "num_tokens": 323514393.0, "reward": 1.12255859375, "reward_std": 0.13890141248703003, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98974609375, "rewards/tag_count_reward/std": 0.08717872947454453, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1894.0, "completions/mean_length": 609.705078125, "completions/mean_terminated_length": 589.768310546875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.1910315677523613, "epoch": 0.2536701946056675, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11123523890332564, "learning_rate": 9.370934550491547e-07, "loss": 0.0715, "num_tokens": 323905202.0, "reward": 1.08740234375, "reward_std": 0.13301792740821838, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98974609375, "rewards/tag_count_reward/std": 0.08717872947454453, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1910.0, "completions/mean_length": 660.546875, "completions/mean_terminated_length": 652.369384765625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.19063181802630424, "epoch": 0.25401160805735745, "frac_reward_zero_std": 0.5, "grad_norm": 0.1152378660836529, "learning_rate": 9.36819691520461e-07, "loss": -0.0035, "num_tokens": 324316394.0, "reward": 1.1220703125, "reward_std": 0.1727963089942932, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1723.0, "completions/mean_length": 609.453125, "completions/mean_terminated_length": 595.2662963867188, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.20241661369800568, "epoch": 0.2543530215090475, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12297460760619926, "learning_rate": 9.365453785553313e-07, "loss": 0.0566, "num_tokens": 324703634.0, "reward": 1.18798828125, "reward_std": 0.14695346355438232, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3968288004398346, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1878.0, "completions/mean_length": 575.8515625, "completions/mean_terminated_length": 567.1748657226562, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.22432704269886017, "epoch": 0.25469443496073746, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12888136091603897, "learning_rate": 9.36270516543398e-07, "loss": 0.0269, "num_tokens": 325075910.0, "reward": 1.19482421875, "reward_std": 0.1734844148159027, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.39980348944664, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 551.1484375, "completions/mean_terminated_length": 545.2784423828125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.23196454346179962, "epoch": 0.25503584841242743, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13763405297657214, "learning_rate": 9.359951058750738e-07, "loss": -0.0024, "num_tokens": 325439826.0, "reward": 1.1572265625, "reward_std": 0.17009927332401276, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 637.234375, "completions/mean_terminated_length": 631.7020263671875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.17925050482153893, "epoch": 0.25537726186411747, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09748595151393874, "learning_rate": 9.357191469415501e-07, "loss": 0.0277, "num_tokens": 325839866.0, "reward": 1.16064453125, "reward_std": 0.11033955216407776, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 597.375, "completions/mean_terminated_length": 591.686279296875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2130432091653347, "epoch": 0.25571867531580744, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1288771206720774, "learning_rate": 9.354426401347974e-07, "loss": 0.016, "num_tokens": 326231530.0, "reward": 1.0615234375, "reward_std": 0.12095179408788681, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.041276250034570694, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1682.0, "completions/mean_length": 538.123046875, "completions/mean_terminated_length": 529.2239990234375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.2095038704574108, "epoch": 0.2560600887674974, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1274833288233543, "learning_rate": 9.351655858475646e-07, "loss": 0.0347, "num_tokens": 326582361.0, "reward": 1.1123046875, "reward_std": 0.12121747434139252, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1697.0, "completions/mean_length": 536.244140625, "completions/mean_terminated_length": 533.2857055664062, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.23268677294254303, "epoch": 0.25640150221918745, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13400927249972136, "learning_rate": 9.348879844733779e-07, "loss": 0.0285, "num_tokens": 326936326.0, "reward": 1.18994140625, "reward_std": 0.15176761150360107, "rewards/accuracy_reward/mean": 0.193359375, "rewards/accuracy_reward/std": 0.39531853795051575, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.045470330864191055, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 583.26171875, "completions/mean_terminated_length": 562.9584350585938, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.18879177048802376, "epoch": 0.2567429156708774, "frac_reward_zero_std": 0.625, "grad_norm": 0.1291265142106832, "learning_rate": 9.346098364065405e-07, "loss": 0.0347, "num_tokens": 327315004.0, "reward": 1.08447265625, "reward_std": 0.13740262389183044, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98876953125, "rewards/tag_count_reward/std": 0.08982396870851517, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1889.0, "completions/mean_length": 614.529296875, "completions/mean_terminated_length": 580.1260375976562, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.2021884098649025, "epoch": 0.25708432912256746, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14628760508280444, "learning_rate": 9.343311420421323e-07, "loss": 0.0671, "num_tokens": 327706763.0, "reward": 1.08984375, "reward_std": 0.14109767973423004, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.982421875, "rewards/tag_count_reward/std": 0.11357727646827698, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1669.0, "completions/mean_length": 608.423828125, "completions/mean_terminated_length": 599.9391479492188, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.19734089821577072, "epoch": 0.25742574257425743, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10877708074861657, "learning_rate": 9.340519017760093e-07, "loss": 0.0242, "num_tokens": 328098628.0, "reward": 1.15771484375, "reward_std": 0.15767592191696167, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1876.0, "completions/mean_length": 585.697265625, "completions/mean_terminated_length": 565.427734375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.19293183833360672, "epoch": 0.2577671560259474, "frac_reward_zero_std": 0.5, "grad_norm": 0.13551791249458434, "learning_rate": 9.337721160048028e-07, "loss": 0.0902, "num_tokens": 328471929.0, "reward": 1.16650390625, "reward_std": 0.17056946456432343, "rewards/accuracy_reward/mean": 0.17578125, "rewards/accuracy_reward/std": 0.3810062110424042, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99072265625, "rewards/tag_count_reward/std": 0.0814909115433693, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 532.478515625, "completions/mean_terminated_length": 529.5126953125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.24834315106272697, "epoch": 0.25810856947763744, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14450798277105764, "learning_rate": 9.334917851259187e-07, "loss": 0.0086, "num_tokens": 328820606.0, "reward": 1.16259765625, "reward_std": 0.1605379283428192, "rewards/accuracy_reward/mean": 0.16935484111309052, "rewards/accuracy_reward/std": 0.3754436671733856, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 613.05859375, "completions/mean_terminated_length": 598.9072875976562, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.1770991086959839, "epoch": 0.2584499829293274, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11546032933469669, "learning_rate": 9.332109095375376e-07, "loss": 0.0768, "num_tokens": 329213100.0, "reward": 1.099609375, "reward_std": 0.17800644040107727, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.07459938526153564, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1733.0, "completions/mean_length": 660.779296875, "completions/mean_terminated_length": 635.958251953125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.17962585762143135, "epoch": 0.2587913963810174, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13445051293997895, "learning_rate": 9.329294896386131e-07, "loss": 0.0603, "num_tokens": 329631035.0, "reward": 1.14306640625, "reward_std": 0.20578958094120026, "rewards/accuracy_reward/mean": 0.16129031777381897, "rewards/accuracy_reward/std": 0.3681698739528656, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98681640625, "rewards/tag_count_reward/std": 0.09865544736385345, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1514.0, "completions/mean_length": 524.556640625, "completions/mean_terminated_length": 515.57763671875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.22251640632748604, "epoch": 0.2591328098327074, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10486389846271388, "learning_rate": 9.326475258288729e-07, "loss": 0.0456, "num_tokens": 329972408.0, "reward": 1.13232421875, "reward_std": 0.12553326785564423, "rewards/accuracy_reward/mean": 0.1411290317773819, "rewards/accuracy_reward/std": 0.3485061228275299, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1479.0, "completions/mean_length": 606.609375, "completions/mean_terminated_length": 595.2598266601562, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.22303031384944916, "epoch": 0.2594742232843974, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11833347603126364, "learning_rate": 9.323650185088164e-07, "loss": 0.0384, "num_tokens": 330363392.0, "reward": 1.1845703125, "reward_std": 0.1859341859817505, "rewards/accuracy_reward/mean": 0.189453125, "rewards/accuracy_reward/std": 0.3922513723373413, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 647.822265625, "completions/mean_terminated_length": 636.7972412109375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.19687655940651894, "epoch": 0.2598156367360874, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09781561851731481, "learning_rate": 9.320819680797154e-07, "loss": 0.0494, "num_tokens": 330771173.0, "reward": 1.150390625, "reward_std": 0.11786006391048431, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 548.0859375, "completions/mean_terminated_length": 542.2039794921875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.20319147780537605, "epoch": 0.2601570501877774, "frac_reward_zero_std": 0.625, "grad_norm": 0.11721217547247442, "learning_rate": 9.317983749436133e-07, "loss": 0.0042, "num_tokens": 331125425.0, "reward": 1.11376953125, "reward_std": 0.14323845505714417, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1748.0, "completions/mean_length": 523.90234375, "completions/mean_terminated_length": 517.925537109375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.22499935701489449, "epoch": 0.2604984636394674, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11325345928250369, "learning_rate": 9.315142395033239e-07, "loss": 0.0335, "num_tokens": 331474127.0, "reward": 1.1552734375, "reward_std": 0.1248479038476944, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 582.224609375, "completions/mean_terminated_length": 570.6830444335938, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.21305035054683685, "epoch": 0.2608398770911574, "frac_reward_zero_std": 0.75, "grad_norm": 0.08240239158623382, "learning_rate": 9.312295621624317e-07, "loss": 0.05, "num_tokens": 331843586.0, "reward": 1.087890625, "reward_std": 0.07772675156593323, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 604.615234375, "completions/mean_terminated_length": 593.25, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.1979687437415123, "epoch": 0.2611812905428474, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08848370213596617, "learning_rate": 9.309443433252904e-07, "loss": 0.0278, "num_tokens": 332224749.0, "reward": 1.060546875, "reward_std": 0.08902786672115326, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 635.10546875, "completions/mean_terminated_length": 571.6959228515625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.22426877543330193, "epoch": 0.26152270399453736, "frac_reward_zero_std": 0.34375, "grad_norm": 0.1599894941812199, "learning_rate": 9.306585833970237e-07, "loss": 0.091, "num_tokens": 332633875.0, "reward": 1.205078125, "reward_std": 0.22229601442813873, "rewards/accuracy_reward/mean": 0.240234375, "rewards/accuracy_reward/std": 0.4276435375213623, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1539892703294754, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1549.0, "completions/mean_length": 573.841796875, "completions/mean_terminated_length": 565.1532592773438, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.19470980390906334, "epoch": 0.2618641174462274, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10715850203425759, "learning_rate": 9.303722827835229e-07, "loss": 0.0158, "num_tokens": 333004770.0, "reward": 1.19287109375, "reward_std": 0.15480029582977295, "rewards/accuracy_reward/mean": 0.197265625, "rewards/accuracy_reward/std": 0.3983237147331238, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1825.0, "completions/max_terminated_length": 1825.0, "completions/mean_length": 605.828125, "completions/mean_terminated_length": 605.828125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.1930515430867672, "epoch": 0.26220553089791737, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10365981539564102, "learning_rate": 9.30085441891448e-07, "loss": 0.0233, "num_tokens": 333390458.0, "reward": 1.16357421875, "reward_std": 0.15990179777145386, "rewards/accuracy_reward/mean": 0.16935484111309052, "rewards/accuracy_reward/std": 0.375443696975708, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 581.248046875, "completions/mean_terminated_length": 560.9168701171875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.1907409504055977, "epoch": 0.26254694434960735, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09281972192801574, "learning_rate": 9.297980611282259e-07, "loss": 0.0759, "num_tokens": 333757897.0, "reward": 1.08056640625, "reward_std": 0.09169255197048187, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99072265625, "rewards/tag_count_reward/std": 0.0814909115433693, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 633.107421875, "completions/mean_terminated_length": 621.966552734375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.20285219326615334, "epoch": 0.2628883578012974, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11654004183352308, "learning_rate": 9.295101409020507e-07, "loss": 0.018, "num_tokens": 334163648.0, "reward": 1.13671875, "reward_std": 0.1420758068561554, "rewards/accuracy_reward/mean": 0.14717741310596466, "rewards/accuracy_reward/std": 0.3546403646469116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 614.376953125, "completions/mean_terminated_length": 603.090576171875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.1655271090567112, "epoch": 0.26322977125298735, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10591947331096958, "learning_rate": 9.292216816218826e-07, "loss": 0.0292, "num_tokens": 334562353.0, "reward": 1.16552734375, "reward_std": 0.15005962550640106, "rewards/accuracy_reward/mean": 0.185546875, "rewards/accuracy_reward/std": 0.38912075757980347, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97998046875, "rewards/tag_count_reward/std": 0.10357881337404251, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1659.0, "completions/mean_length": 568.17578125, "completions/mean_terminated_length": 553.5818481445312, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.20361803099513054, "epoch": 0.2635711847046774, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10313555885644723, "learning_rate": 9.289326836974474e-07, "loss": 0.0526, "num_tokens": 334926059.0, "reward": 1.06103515625, "reward_std": 0.12241451442241669, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 605.4375, "completions/mean_terminated_length": 596.9351806640625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.21007542684674263, "epoch": 0.26391259815636736, "frac_reward_zero_std": 0.625, "grad_norm": 0.10313903781173721, "learning_rate": 9.286431475392363e-07, "loss": 0.0264, "num_tokens": 335310651.0, "reward": 1.05615234375, "reward_std": 0.13143262267112732, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1820.0, "completions/mean_length": 536.484375, "completions/mean_terminated_length": 533.5264282226562, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.21075664460659027, "epoch": 0.26425401160805734, "frac_reward_zero_std": 0.5, "grad_norm": 0.13808429012164294, "learning_rate": 9.283530735585044e-07, "loss": -0.0022, "num_tokens": 335663299.0, "reward": 1.23681640625, "reward_std": 0.20534053444862366, "rewards/accuracy_reward/mean": 0.23828125, "rewards/accuracy_reward/std": 0.42644867300987244, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1840.0, "completions/max_terminated_length": 1840.0, "completions/mean_length": 559.35546875, "completions/mean_terminated_length": 559.35546875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.20663504302501678, "epoch": 0.26459542505974737, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13062179903204912, "learning_rate": 9.280624621672716e-07, "loss": 0.0011, "num_tokens": 336021033.0, "reward": 1.21875, "reward_std": 0.1464497447013855, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41380295157432556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 577.0703125, "completions/mean_terminated_length": 571.302001953125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.19144031777977943, "epoch": 0.26493683851143734, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10719583501728476, "learning_rate": 9.2777131377832e-07, "loss": 0.0225, "num_tokens": 336391197.0, "reward": 1.0888671875, "reward_std": 0.1269715428352356, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1423.0, "completions/mean_length": 547.94921875, "completions/mean_terminated_length": 539.1080932617188, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.21695813909173012, "epoch": 0.2652782519631273, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11670854473327993, "learning_rate": 9.274796288051956e-07, "loss": 0.0181, "num_tokens": 336751235.0, "reward": 1.07421875, "reward_std": 0.11993157118558884, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.051725368946790695, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 611.6875, "completions/mean_terminated_length": 597.522705078125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.18824738636612892, "epoch": 0.26561966541481735, "frac_reward_zero_std": 0.5, "grad_norm": 0.128357578729789, "learning_rate": 9.271874076622057e-07, "loss": 0.0655, "num_tokens": 337142339.0, "reward": 1.10400390625, "reward_std": 0.17257189750671387, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 570.990234375, "completions/mean_terminated_length": 568.0997924804688, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.20349490642547607, "epoch": 0.26596107886650733, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11830315461820912, "learning_rate": 9.268946507644197e-07, "loss": 0.012, "num_tokens": 337513422.0, "reward": 1.25439453125, "reward_std": 0.16666802763938904, "rewards/accuracy_reward/mean": 0.255859375, "rewards/accuracy_reward/std": 0.43676990270614624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 534.521484375, "completions/mean_terminated_length": 525.6011962890625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.22920306771993637, "epoch": 0.26630249231819736, "frac_reward_zero_std": 0.53125, "grad_norm": 0.16360844263566354, "learning_rate": 9.266013585276678e-07, "loss": 0.0691, "num_tokens": 337859865.0, "reward": 1.20654296875, "reward_std": 0.17407047748565674, "rewards/accuracy_reward/mean": 0.2177419364452362, "rewards/accuracy_reward/std": 0.41312772035598755, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 577.775390625, "completions/mean_terminated_length": 577.775390625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.20148683711886406, "epoch": 0.26664390576988733, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09888305945693356, "learning_rate": 9.263075313685405e-07, "loss": -0.009, "num_tokens": 338229254.0, "reward": 1.1640625, "reward_std": 0.09908140450716019, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 583.7109375, "completions/mean_terminated_length": 560.46826171875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.21065392345190048, "epoch": 0.2669853192215773, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12004192978550401, "learning_rate": 9.260131697043882e-07, "loss": 0.0583, "num_tokens": 338603074.0, "reward": 1.1015625, "reward_std": 0.13921822607517242, "rewards/accuracy_reward/mean": 0.11693548411130905, "rewards/accuracy_reward/std": 0.32166779041290283, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.09310565888881683, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1754.0, "completions/mean_length": 592.294921875, "completions/mean_terminated_length": 586.5863037109375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.1847468614578247, "epoch": 0.26732673267326734, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08922829123724513, "learning_rate": 9.257182739533203e-07, "loss": 0.0096, "num_tokens": 338980633.0, "reward": 1.1142578125, "reward_std": 0.10302646458148956, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1811.0, "completions/mean_length": 517.046875, "completions/mean_terminated_length": 514.0509033203125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.20588430762290955, "epoch": 0.2676681461249573, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12449713428392054, "learning_rate": 9.254228445342056e-07, "loss": -0.0011, "num_tokens": 339319761.0, "reward": 1.16259765625, "reward_std": 0.16719400882720947, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 553.09765625, "completions/mean_terminated_length": 538.3550415039062, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.22195735573768616, "epoch": 0.2680095595766473, "frac_reward_zero_std": 0.78125, "grad_norm": 0.0988426419250602, "learning_rate": 9.251268818666695e-07, "loss": 0.0595, "num_tokens": 339676067.0, "reward": 1.10009765625, "reward_std": 0.076171875, "rewards/accuracy_reward/mean": 0.11088709533214569, "rewards/accuracy_reward/std": 0.3143092691898346, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1419.0, "completions/mean_length": 532.388671875, "completions/mean_terminated_length": 529.4226684570312, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.19688130915164948, "epoch": 0.2683509730283373, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12461629640076581, "learning_rate": 9.248303863710965e-07, "loss": -0.0069, "num_tokens": 340021946.0, "reward": 1.12939453125, "reward_std": 0.18345853686332703, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1929.0, "completions/mean_length": 555.220703125, "completions/mean_terminated_length": 543.466552734375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.19457419961690903, "epoch": 0.2686923864800273, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10046751084929699, "learning_rate": 9.245333584686264e-07, "loss": 0.0441, "num_tokens": 340380635.0, "reward": 1.203125, "reward_std": 0.15379133820533752, "rewards/accuracy_reward/mean": 0.208984375, "rewards/accuracy_reward/std": 0.40698084235191345, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1786.0, "completions/mean_length": 560.130859375, "completions/mean_terminated_length": 554.2960815429688, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.19598126038908958, "epoch": 0.26903379993171733, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11111371673084063, "learning_rate": 9.242357985811562e-07, "loss": -0.0122, "num_tokens": 340742926.0, "reward": 1.1943359375, "reward_std": 0.17403383553028107, "rewards/accuracy_reward/mean": 0.197265625, "rewards/accuracy_reward/std": 0.3983237147331238, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 538.70703125, "completions/mean_terminated_length": 535.75341796875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.1889532171189785, "epoch": 0.2693752133834073, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13920289153936677, "learning_rate": 9.239377071313381e-07, "loss": -0.0009, "num_tokens": 341095032.0, "reward": 1.22119140625, "reward_std": 0.18173113465309143, "rewards/accuracy_reward/mean": 0.22265625, "rewards/accuracy_reward/std": 0.41643625497817993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1679.0, "completions/mean_length": 552.98828125, "completions/mean_terminated_length": 544.1768188476562, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.21568544209003448, "epoch": 0.2697166268350973, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12519543180471698, "learning_rate": 9.236390845425797e-07, "loss": 0.0517, "num_tokens": 341452738.0, "reward": 1.18896484375, "reward_std": 0.14507648348808289, "rewards/accuracy_reward/mean": 0.193359375, "rewards/accuracy_reward/std": 0.39531853795051575, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 573.7890625, "completions/mean_terminated_length": 526.2338256835938, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.17812660336494446, "epoch": 0.2700580402867873, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12986864847936774, "learning_rate": 9.233399312390427e-07, "loss": 0.1263, "num_tokens": 341811750.0, "reward": 1.150390625, "reward_std": 0.1985408365726471, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.978515625, "rewards/tag_count_reward/std": 0.12326027452945709, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1746.0, "completions/mean_length": 619.681640625, "completions/mean_terminated_length": 608.43505859375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.18037960305809975, "epoch": 0.2703994537384773, "frac_reward_zero_std": 0.625, "grad_norm": 0.10570362807516641, "learning_rate": 9.230402476456424e-07, "loss": 0.0462, "num_tokens": 342205283.0, "reward": 1.091796875, "reward_std": 0.1351180374622345, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1968.0, "completions/mean_length": 585.529296875, "completions/mean_terminated_length": 576.90966796875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.19744813442230225, "epoch": 0.27074086719016727, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10645332546356737, "learning_rate": 9.22740034188048e-07, "loss": 0.0172, "num_tokens": 342584802.0, "reward": 1.18701171875, "reward_std": 0.13024836778640747, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.3937928080558777, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1926.0, "completions/mean_length": 600.884765625, "completions/mean_terminated_length": 557.2333984375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.22130942717194557, "epoch": 0.2710822806418573, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10442149546980864, "learning_rate": 9.224392912926809e-07, "loss": 0.0275, "num_tokens": 342982551.0, "reward": 1.087890625, "reward_std": 0.12897025048732758, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.974609375, "rewards/tag_count_reward/std": 0.1311914473772049, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1605.0, "completions/mean_length": 561.904296875, "completions/mean_terminated_length": 558.99609375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.18751618638634682, "epoch": 0.2714236940935473, "frac_reward_zero_std": 0.5, "grad_norm": 0.13793680244128254, "learning_rate": 9.221380193867144e-07, "loss": -0.0149, "num_tokens": 343338406.0, "reward": 1.1982421875, "reward_std": 0.20155708491802216, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.39980348944664, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 605.017578125, "completions/mean_terminated_length": 596.5128173828125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.1865694560110569, "epoch": 0.2717651075452373, "frac_reward_zero_std": 0.5, "grad_norm": 0.11523720997199569, "learning_rate": 9.218362188980732e-07, "loss": 0.0397, "num_tokens": 343724831.0, "reward": 1.10693359375, "reward_std": 0.18303436040878296, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1550.0, "completions/mean_length": 531.0859375, "completions/mean_terminated_length": 522.1453857421875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.19346706569194794, "epoch": 0.2721065209969273, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13140054690331598, "learning_rate": 9.215338902554335e-07, "loss": 0.0385, "num_tokens": 344075339.0, "reward": 1.22998046875, "reward_std": 0.17199741303920746, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42402184009552, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1726.0, "completions/mean_length": 597.78125, "completions/mean_terminated_length": 548.006103515625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.20193799957633018, "epoch": 0.27244793444861726, "frac_reward_zero_std": 0.625, "grad_norm": 0.1159265623874528, "learning_rate": 9.212310338882207e-07, "loss": 0.0127, "num_tokens": 344458683.0, "reward": 1.1396484375, "reward_std": 0.14703604578971863, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9755859375, "rewards/tag_count_reward/std": 0.1304427534341812, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1677.0, "completions/mean_length": 557.0859375, "completions/mean_terminated_length": 554.1682739257812, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.19274992123246193, "epoch": 0.2727893479003073, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12345283339428817, "learning_rate": 9.209276502266102e-07, "loss": 0.0207, "num_tokens": 344814695.0, "reward": 1.19970703125, "reward_std": 0.18014831840991974, "rewards/accuracy_reward/mean": 0.201171875, "rewards/accuracy_reward/std": 0.4012683033943176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1948.0, "completions/max_terminated_length": 1948.0, "completions/mean_length": 543.541015625, "completions/mean_terminated_length": 543.541015625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.19327308610081673, "epoch": 0.27313076135199726, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11104182617640464, "learning_rate": 9.206237397015267e-07, "loss": -0.01, "num_tokens": 345168892.0, "reward": 1.111328125, "reward_std": 0.11344268918037415, "rewards/accuracy_reward/mean": 0.11491935700178146, "rewards/accuracy_reward/std": 0.3192465901374817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1609.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 581.111328125, "completions/mean_terminated_length": 581.111328125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.18068860471248627, "epoch": 0.27347217480368724, "frac_reward_zero_std": 0.625, "grad_norm": 0.11241746018593392, "learning_rate": 9.203193027446429e-07, "loss": -0.0079, "num_tokens": 345543477.0, "reward": 1.1552734375, "reward_std": 0.1267768144607544, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1726.0, "completions/mean_length": 585.740234375, "completions/mean_terminated_length": 571.3195190429688, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.15614597871899605, "epoch": 0.27381358825537727, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12808963570161383, "learning_rate": 9.20014339788379e-07, "loss": 0.0642, "num_tokens": 345919472.0, "reward": 1.09228515625, "reward_std": 0.14267151057720184, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 535.48828125, "completions/mean_terminated_length": 532.5283813476562, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2103257030248642, "epoch": 0.27415500170706725, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13169716629940786, "learning_rate": 9.197088512659028e-07, "loss": 0.0148, "num_tokens": 346268154.0, "reward": 1.18212890625, "reward_std": 0.16633784770965576, "rewards/accuracy_reward/mean": 0.181640625, "rewards/accuracy_reward/std": 0.38592514395713806, "rewards/format_reward/mean": 0.001953125, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 549.70703125, "completions/mean_terminated_length": 540.876220703125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.18936654552817345, "epoch": 0.2744964151587573, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13499928299712965, "learning_rate": 9.194028376111284e-07, "loss": 0.0154, "num_tokens": 346621684.0, "reward": 1.15966796875, "reward_std": 0.16566592454910278, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1733.0, "completions/mean_length": 587.587890625, "completions/mean_terminated_length": 576.0885620117188, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.17817265167832375, "epoch": 0.27483782861044725, "frac_reward_zero_std": 0.59375, "grad_norm": 0.101655355390812, "learning_rate": 9.190962992587157e-07, "loss": 0.0416, "num_tokens": 346994993.0, "reward": 1.1865234375, "reward_std": 0.14714175462722778, "rewards/accuracy_reward/mean": 0.193359375, "rewards/accuracy_reward/std": 0.39531853795051575, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06960996240377426, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 528.955078125, "completions/mean_terminated_length": 522.998046875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.19862420484423637, "epoch": 0.27517924206213723, "frac_reward_zero_std": 0.75, "grad_norm": 0.08571783662734349, "learning_rate": 9.187892366440702e-07, "loss": 0.0287, "num_tokens": 347348906.0, "reward": 1.0712890625, "reward_std": 0.08587148785591125, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1899.0, "completions/mean_length": 569.24609375, "completions/mean_terminated_length": 548.7584228515625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.20466315001249313, "epoch": 0.27552065551382726, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1317692868788842, "learning_rate": 9.184816502033417e-07, "loss": 0.0082, "num_tokens": 347723144.0, "reward": 1.08349609375, "reward_std": 0.1679530143737793, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97998046875, "rewards/tag_count_reward/std": 0.10706271976232529, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 538.267578125, "completions/mean_terminated_length": 535.3131103515625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.1873331256210804, "epoch": 0.27586206896551724, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09425751733453358, "learning_rate": 9.181735403734241e-07, "loss": 0.0217, "num_tokens": 348073953.0, "reward": 1.16943359375, "reward_std": 0.10258351266384125, "rewards/accuracy_reward/mean": 0.169921875, "rewards/accuracy_reward/std": 0.3759314715862274, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1592.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 550.76953125, "completions/mean_terminated_length": 550.76953125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.18833984807133675, "epoch": 0.2762034824172072, "frac_reward_zero_std": 0.75, "grad_norm": 0.09765466752605743, "learning_rate": 9.17864907591955e-07, "loss": -0.002, "num_tokens": 348441323.0, "reward": 1.166015625, "reward_std": 0.085491843521595, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 522.8046875, "completions/mean_terminated_length": 519.8199462890625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.20850097388029099, "epoch": 0.27654489586889724, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11405815068261581, "learning_rate": 9.175557522973146e-07, "loss": 0.028, "num_tokens": 348782375.0, "reward": 1.07763671875, "reward_std": 0.1076611801981926, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.039800092577934265, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1911.0, "completions/mean_length": 525.0859375, "completions/mean_terminated_length": 516.1100463867188, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.20936140418052673, "epoch": 0.2768863093205872, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12610475079515848, "learning_rate": 9.17246074928625e-07, "loss": 0.0359, "num_tokens": 349128259.0, "reward": 1.10595703125, "reward_std": 0.1049148440361023, "rewards/accuracy_reward/mean": 0.11491935700178146, "rewards/accuracy_reward/std": 0.3192465901374817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.06134068965911865, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1562.0, "completions/mean_length": 475.330078125, "completions/mean_terminated_length": 456.68182373046875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.22587556764483452, "epoch": 0.27722772277227725, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1953815261993142, "learning_rate": 9.169358759257508e-07, "loss": 0.0714, "num_tokens": 349451452.0, "reward": 1.21044921875, "reward_std": 0.17776286602020264, "rewards/accuracy_reward/mean": 0.216796875, "rewards/accuracy_reward/std": 0.4124660789966583, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.06124715134501457, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1691.0, "completions/mean_length": 530.146484375, "completions/mean_terminated_length": 527.1761474609375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.19672177359461784, "epoch": 0.2775691362239672, "frac_reward_zero_std": 0.75, "grad_norm": 0.09957225738474687, "learning_rate": 9.166251557292959e-07, "loss": 0.0137, "num_tokens": 349800439.0, "reward": 1.04150390625, "reward_std": 0.08931578695774078, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 530.853515625, "completions/mean_terminated_length": 524.9039306640625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.19111325219273567, "epoch": 0.2779105496756572, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11209262919468989, "learning_rate": 9.163139147806062e-07, "loss": 0.0198, "num_tokens": 350145820.0, "reward": 1.0986328125, "reward_std": 0.14623546600341797, "rewards/accuracy_reward/mean": 0.10483870655298233, "rewards/accuracy_reward/std": 0.30665475130081177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1792.0, "completions/mean_length": 560.67578125, "completions/mean_terminated_length": 551.90966796875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.19248675554990768, "epoch": 0.27825196312734723, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10209943552658296, "learning_rate": 9.160021535217661e-07, "loss": 0.0363, "num_tokens": 350515174.0, "reward": 1.12255859375, "reward_std": 0.09247629344463348, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1427.0, "completions/mean_length": 520.296875, "completions/mean_terminated_length": 514.305908203125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.19530781731009483, "epoch": 0.2785933765790372, "frac_reward_zero_std": 0.75, "grad_norm": 0.09917923318246492, "learning_rate": 9.156898723955997e-07, "loss": 0.0247, "num_tokens": 350865150.0, "reward": 1.07958984375, "reward_std": 0.07056179642677307, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1502.0, "completions/mean_length": 522.962890625, "completions/mean_terminated_length": 519.9784545898438, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.19791674613952637, "epoch": 0.2789347900307272, "frac_reward_zero_std": 0.84375, "grad_norm": 0.07795060129581564, "learning_rate": 9.153770718456693e-07, "loss": 0.0155, "num_tokens": 351208907.0, "reward": 1.10595703125, "reward_std": 0.05830947682261467, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1339.0, "completions/mean_length": 473.3046875, "completions/mean_terminated_length": 470.22308349609375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2149548940360546, "epoch": 0.2792762034824172, "frac_reward_zero_std": 0.5, "grad_norm": 0.14745540789395886, "learning_rate": 9.15063752316275e-07, "loss": -0.0001, "num_tokens": 351524311.0, "reward": 1.21728515625, "reward_std": 0.1921226978302002, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41380295157432556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1828.0, "completions/mean_length": 564.33984375, "completions/mean_terminated_length": 552.657470703125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.19027135521173477, "epoch": 0.2796176169341072, "frac_reward_zero_std": 0.5, "grad_norm": 0.13014520404007704, "learning_rate": 9.14749914252454e-07, "loss": 0.0098, "num_tokens": 351894341.0, "reward": 1.142578125, "reward_std": 0.18477213382720947, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 512.646484375, "completions/mean_terminated_length": 506.6255187988281, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.1957252323627472, "epoch": 0.2799590303857972, "frac_reward_zero_std": 0.5, "grad_norm": 0.13554751402195309, "learning_rate": 9.144355580999798e-07, "loss": 0.0155, "num_tokens": 352229312.0, "reward": 1.1787109375, "reward_std": 0.19257867336273193, "rewards/accuracy_reward/mean": 0.181640625, "rewards/accuracy_reward/std": 0.38592514395713806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1768.0, "completions/mean_length": 510.365234375, "completions/mean_terminated_length": 507.3561706542969, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.16942913457751274, "epoch": 0.2803004438374872, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11344162639915154, "learning_rate": 9.141206843053624e-07, "loss": -0.0033, "num_tokens": 352565163.0, "reward": 1.17431640625, "reward_std": 0.13051435351371765, "rewards/accuracy_reward/mean": 0.17578125, "rewards/accuracy_reward/std": 0.3810062110424042, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1718.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 530.5859375, "completions/mean_terminated_length": 530.5859375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.19175608828663826, "epoch": 0.2806418572891772, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12230452442821038, "learning_rate": 9.138052933158466e-07, "loss": -0.0091, "num_tokens": 352918263.0, "reward": 1.19287109375, "reward_std": 0.146747887134552, "rewards/accuracy_reward/mean": 0.193359375, "rewards/accuracy_reward/std": 0.39531853795051575, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 506.50390625, "completions/mean_terminated_length": 503.4872741699219, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.1878349520266056, "epoch": 0.2809832707408672, "frac_reward_zero_std": 0.78125, "grad_norm": 0.10025070655942667, "learning_rate": 9.134893855794118e-07, "loss": 0.0091, "num_tokens": 353257033.0, "reward": 1.14697265625, "reward_std": 0.08857004344463348, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 519.85546875, "completions/mean_terminated_length": 504.7850036621094, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.19695154577493668, "epoch": 0.2813246841925572, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12666566828561984, "learning_rate": 9.131729615447715e-07, "loss": 0.0788, "num_tokens": 353605871.0, "reward": 1.19384765625, "reward_std": 0.1393335610628128, "rewards/accuracy_reward/mean": 0.201171875, "rewards/accuracy_reward/std": 0.4012683033943176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 532.9609375, "completions/mean_terminated_length": 521.031494140625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.17955365777015686, "epoch": 0.28166609764424716, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11488225141232544, "learning_rate": 9.128560216613729e-07, "loss": 0.0431, "num_tokens": 353955515.0, "reward": 1.12890625, "reward_std": 0.1361766755580902, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1707.0, "completions/mean_length": 514.94921875, "completions/mean_terminated_length": 511.9491271972656, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.20263485237956047, "epoch": 0.2820075110959372, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10923020434243678, "learning_rate": 9.125385663793951e-07, "loss": 0.0122, "num_tokens": 354294097.0, "reward": 1.09619140625, "reward_std": 0.11599360406398773, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1638.0, "completions/mean_length": 548.1328125, "completions/mean_terminated_length": 496.6545715332031, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.20137200132012367, "epoch": 0.28234892454762717, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11795034354106512, "learning_rate": 9.122205961497502e-07, "loss": 0.0059, "num_tokens": 354667621.0, "reward": 1.16455078125, "reward_std": 0.1239396184682846, "rewards/accuracy_reward/mean": 0.189453125, "rewards/accuracy_reward/std": 0.3922513723373413, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97509765625, "rewards/tag_count_reward/std": 0.13450638949871063, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 531.265625, "completions/mean_terminated_length": 525.3176879882812, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.15438692271709442, "epoch": 0.2826903379993172, "frac_reward_zero_std": 0.625, "grad_norm": 0.11146167789609941, "learning_rate": 9.11902111424081e-07, "loss": 0.0336, "num_tokens": 355019789.0, "reward": 1.24169921875, "reward_std": 0.14702624082565308, "rewards/accuracy_reward/mean": 0.244140625, "rewards/accuracy_reward/std": 0.42999663949012756, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.039800092577934265, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 553.689453125, "completions/mean_terminated_length": 547.8294677734375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.18355930596590042, "epoch": 0.2830317514510072, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11729433548510301, "learning_rate": 9.11583112654761e-07, "loss": 0.0421, "num_tokens": 355383006.0, "reward": 1.1064453125, "reward_std": 0.13731440901756287, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1509.0, "completions/max_terminated_length": 1509.0, "completions/mean_length": 531.853515625, "completions/mean_terminated_length": 531.853515625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.1764063537120819, "epoch": 0.28337316490269715, "frac_reward_zero_std": 0.78125, "grad_norm": 0.07835314610599753, "learning_rate": 9.112636002948949e-07, "loss": -0.0126, "num_tokens": 355737075.0, "reward": 1.12109375, "reward_std": 0.09369811415672302, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 601.767578125, "completions/mean_terminated_length": 598.9373779296875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.17761816456913948, "epoch": 0.2837145783543872, "frac_reward_zero_std": 0.625, "grad_norm": 0.10512916032157928, "learning_rate": 9.109435747983158e-07, "loss": 0.0006, "num_tokens": 356119884.0, "reward": 1.07666015625, "reward_std": 0.13476866483688354, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 551.5625, "completions/mean_terminated_length": 500.20001220703125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.19862603396177292, "epoch": 0.28405599180607716, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1360476611578834, "learning_rate": 9.106230366195859e-07, "loss": 0.036, "num_tokens": 356483260.0, "reward": 1.19287109375, "reward_std": 0.1652107536792755, "rewards/accuracy_reward/mean": 0.23333333432674408, "rewards/accuracy_reward/std": 0.4233938455581665, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97412109375, "rewards/tag_count_reward/std": 0.13613051176071167, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 548.953125, "completions/mean_terminated_length": 546.0195922851562, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.18356797471642494, "epoch": 0.28439740525776713, "frac_reward_zero_std": 0.75, "grad_norm": 0.07959924368917955, "learning_rate": 9.103019862139961e-07, "loss": 0.0053, "num_tokens": 356856612.0, "reward": 1.07666015625, "reward_std": 0.08122949302196503, "rewards/accuracy_reward/mean": 0.08064515888690948, "rewards/accuracy_reward/std": 0.2725643217563629, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1507.0, "completions/mean_length": 474.85546875, "completions/mean_terminated_length": 468.6863098144531, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.20192110911011696, "epoch": 0.28473881870945716, "frac_reward_zero_std": 0.625, "grad_norm": 0.13365236969698177, "learning_rate": 9.099804240375643e-07, "loss": 0.0335, "num_tokens": 357183322.0, "reward": 1.1552734375, "reward_std": 0.1499156355857849, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1776.0, "completions/mean_length": 564.275390625, "completions/mean_terminated_length": 546.6818237304688, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.16410871967673302, "epoch": 0.28508023216114714, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1232247926454664, "learning_rate": 9.096583505470359e-07, "loss": 0.0679, "num_tokens": 357548855.0, "reward": 1.228515625, "reward_std": 0.18515120446681976, "rewards/accuracy_reward/mean": 0.23828125, "rewards/accuracy_reward/std": 0.42644867300987244, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.08365631848573685, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1733.0, "completions/mean_length": 502.98046875, "completions/mean_terminated_length": 496.9216003417969, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.17694582417607307, "epoch": 0.28542164561283717, "frac_reward_zero_std": 0.5, "grad_norm": 0.14796896019562947, "learning_rate": 9.093357661998817e-07, "loss": 0.0361, "num_tokens": 357885325.0, "reward": 1.2119140625, "reward_std": 0.1782679706811905, "rewards/accuracy_reward/mean": 0.21484375, "rewards/accuracy_reward/std": 0.4111155867576599, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1707.0, "completions/mean_length": 538.73828125, "completions/mean_terminated_length": 535.7847290039062, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.18839161470532417, "epoch": 0.28576305906452715, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12535877313795657, "learning_rate": 9.090126714542989e-07, "loss": 0.0113, "num_tokens": 358236471.0, "reward": 1.18408203125, "reward_std": 0.12944096326828003, "rewards/accuracy_reward/mean": 0.185546875, "rewards/accuracy_reward/std": 0.38912075757980347, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 583.98046875, "completions/mean_terminated_length": 578.2392578125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.1695205643773079, "epoch": 0.2861044725162171, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09128568630747375, "learning_rate": 9.086890667692094e-07, "loss": 0.0332, "num_tokens": 358614045.0, "reward": 1.1083984375, "reward_std": 0.11295098811388016, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1316.0, "completions/mean_length": 572.47265625, "completions/mean_terminated_length": 566.686279296875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.15378158912062645, "epoch": 0.28644588596790715, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11062063512596801, "learning_rate": 9.083649526042594e-07, "loss": 0.0191, "num_tokens": 358978239.0, "reward": 1.1923828125, "reward_std": 0.16885687410831451, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3968288004398346, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 505.466796875, "completions/mean_terminated_length": 496.3752746582031, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2000807300209999, "epoch": 0.28678729941959713, "frac_reward_zero_std": 0.71875, "grad_norm": 0.1091735712521242, "learning_rate": 9.080403294198188e-07, "loss": 0.0303, "num_tokens": 359310814.0, "reward": 1.10302734375, "reward_std": 0.11469173431396484, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 544.869140625, "completions/mean_terminated_length": 541.9276123046875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.18757124990224838, "epoch": 0.2871287128712871, "frac_reward_zero_std": 0.46875, "grad_norm": 0.15253984806482754, "learning_rate": 9.077151976769803e-07, "loss": -0.0031, "num_tokens": 359664459.0, "reward": 1.1298828125, "reward_std": 0.19113270938396454, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1828.0, "completions/max_terminated_length": 1828.0, "completions/mean_length": 474.7578125, "completions/mean_terminated_length": 474.7578125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.16476475447416306, "epoch": 0.28747012632297714, "frac_reward_zero_std": 0.5, "grad_norm": 0.1526647306578087, "learning_rate": 9.073895578375593e-07, "loss": 0.0, "num_tokens": 359975167.0, "reward": 1.263671875, "reward_std": 0.18775449693202972, "rewards/accuracy_reward/mean": 0.263671875, "rewards/accuracy_reward/std": 0.4410543739795685, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 500.61328125, "completions/mean_terminated_length": 497.5851135253906, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.1911250725388527, "epoch": 0.2878115397746671, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13603600263647, "learning_rate": 9.070634103640927e-07, "loss": 0.0179, "num_tokens": 360312057.0, "reward": 1.17822265625, "reward_std": 0.11419020593166351, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.38430243730545044, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 567.92578125, "completions/mean_terminated_length": 559.202392578125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.1713889054954052, "epoch": 0.28815295322635714, "frac_reward_zero_std": 0.5, "grad_norm": 0.1346589109492156, "learning_rate": 9.067367557198384e-07, "loss": 0.0112, "num_tokens": 360680163.0, "reward": 1.16845703125, "reward_std": 0.17344634234905243, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1751.0, "completions/mean_length": 475.841796875, "completions/mean_terminated_length": 469.6764831542969, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.17400097846984863, "epoch": 0.2884943666780471, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1161127901590818, "learning_rate": 9.064095943687747e-07, "loss": 0.027, "num_tokens": 360998930.0, "reward": 1.1318359375, "reward_std": 0.12692567706108093, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1868.0, "completions/mean_length": 491.5546875, "completions/mean_terminated_length": 488.5087890625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.1792304515838623, "epoch": 0.2888357801297371, "frac_reward_zero_std": 0.6875, "grad_norm": 0.107454604269499, "learning_rate": 9.060819267755999e-07, "loss": 0.0172, "num_tokens": 361323662.0, "reward": 1.15283203125, "reward_std": 0.1301790475845337, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1817.0, "completions/max_terminated_length": 1817.0, "completions/mean_length": 531.40625, "completions/mean_terminated_length": 531.40625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.19227559119462967, "epoch": 0.2891771935814271, "frac_reward_zero_std": 0.625, "grad_norm": 0.11897120221039663, "learning_rate": 9.057537534057311e-07, "loss": -0.0053, "num_tokens": 361671118.0, "reward": 1.1328125, "reward_std": 0.14652761816978455, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 539.185546875, "completions/mean_terminated_length": 536.2328491210938, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.17211399972438812, "epoch": 0.2895186070331171, "frac_reward_zero_std": 0.5, "grad_norm": 0.14076584255344063, "learning_rate": 9.054250747253037e-07, "loss": 0.0008, "num_tokens": 362031117.0, "reward": 1.1171875, "reward_std": 0.17532894015312195, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.05978061258792877, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1908.0, "completions/max_terminated_length": 1908.0, "completions/mean_length": 504.087890625, "completions/mean_terminated_length": 504.087890625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.1929405964910984, "epoch": 0.2898600204848071, "frac_reward_zero_std": 0.625, "grad_norm": 0.11008320486321717, "learning_rate": 9.05095891201171e-07, "loss": -0.0119, "num_tokens": 362365146.0, "reward": 1.20703125, "reward_std": 0.14716270565986633, "rewards/accuracy_reward/mean": 0.20703125, "rewards/accuracy_reward/std": 0.40557438135147095, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1641.0, "completions/max_terminated_length": 1641.0, "completions/mean_length": 514.6640625, "completions/mean_terminated_length": 514.6640625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.19888274371623993, "epoch": 0.2902014339364971, "frac_reward_zero_std": 0.5625, "grad_norm": 0.15503317978240524, "learning_rate": 9.047662033009035e-07, "loss": -0.0015, "num_tokens": 362707918.0, "reward": 1.158203125, "reward_std": 0.17848500609397888, "rewards/accuracy_reward/mean": 0.16330644488334656, "rewards/accuracy_reward/std": 0.37001824378967285, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1496.0, "completions/mean_length": 500.310546875, "completions/mean_terminated_length": 494.2412109375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 0.2003958486020565, "epoch": 0.2905428473881871, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1256239523320237, "learning_rate": 9.044360114927879e-07, "loss": 0.0053, "num_tokens": 363043149.0, "reward": 1.07763671875, "reward_std": 0.12187682092189789, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05285605043172836, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 476.435546875, "completions/mean_terminated_length": 470.2725830078125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.2129790037870407, "epoch": 0.2908842608398771, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14901692601132532, "learning_rate": 9.041053162458265e-07, "loss": 0.034, "num_tokens": 363369132.0, "reward": 1.20703125, "reward_std": 0.17001980543136597, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4083731174468994, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.051725368946790695, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1395.0, "completions/mean_length": 547.396484375, "completions/mean_terminated_length": 535.5806884765625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.16200906038284302, "epoch": 0.2912256742915671, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10755547779768403, "learning_rate": 9.037741180297375e-07, "loss": 0.0516, "num_tokens": 363724183.0, "reward": 1.2099609375, "reward_std": 0.12615373730659485, "rewards/accuracy_reward/mean": 0.22379031777381897, "rewards/accuracy_reward/std": 0.41720396280288696, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06960996240377426, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 494.9453125, "completions/mean_terminated_length": 491.90606689453125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.18566228449344635, "epoch": 0.29156708774325707, "frac_reward_zero_std": 0.625, "grad_norm": 0.11583852724393923, "learning_rate": 9.034424173149522e-07, "loss": -0.0005, "num_tokens": 364044571.0, "reward": 1.142578125, "reward_std": 0.14260149002075195, "rewards/accuracy_reward/mean": 0.14919355511665344, "rewards/accuracy_reward/std": 0.3566388487815857, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1656.0, "completions/mean_length": 538.6953125, "completions/mean_terminated_length": 529.7996215820312, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.1731671690940857, "epoch": 0.2919085011949471, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13459996473348107, "learning_rate": 9.031102145726168e-07, "loss": 0.0415, "num_tokens": 364394879.0, "reward": 1.08740234375, "reward_std": 0.15743201971054077, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1825.0, "completions/max_terminated_length": 1825.0, "completions/mean_length": 559.921875, "completions/mean_terminated_length": 559.921875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.16252869740128517, "epoch": 0.2922499146466371, "frac_reward_zero_std": 0.75, "grad_norm": 0.09242423773664984, "learning_rate": 9.027775102745899e-07, "loss": 0.0091, "num_tokens": 364760231.0, "reward": 1.1484375, "reward_std": 0.08351518213748932, "rewards/accuracy_reward/mean": 0.15120968222618103, "rewards/accuracy_reward/std": 0.35861483216285706, "rewards/format_reward/mean": 0.001953125, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 620.373046875, "completions/mean_terminated_length": 568.3846435546875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.17066680267453194, "epoch": 0.29259132809832705, "frac_reward_zero_std": 0.53125, "grad_norm": 0.9005626484075733, "learning_rate": 9.02444304893443e-07, "loss": 0.0524, "num_tokens": 365162502.0, "reward": 1.1279296875, "reward_std": 0.16343814134597778, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9736328125, "rewards/tag_count_reward/std": 0.13648539781570435, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 520.388671875, "completions/mean_terminated_length": 517.3992309570312, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.17551983520388603, "epoch": 0.2929327415500171, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1207895397193084, "learning_rate": 9.021105989024589e-07, "loss": 0.011, "num_tokens": 365501677.0, "reward": 1.08251953125, "reward_std": 0.1298096626996994, "rewards/accuracy_reward/mean": 0.08669354766607285, "rewards/accuracy_reward/std": 0.281669557094574, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1937.0, "completions/max_terminated_length": 1937.0, "completions/mean_length": 453.173828125, "completions/mean_terminated_length": 453.173828125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.20672212913632393, "epoch": 0.29327415500170706, "frac_reward_zero_std": 0.8125, "grad_norm": 0.08869639245591798, "learning_rate": 9.017763927756317e-07, "loss": 0.0067, "num_tokens": 365812390.0, "reward": 1.08984375, "reward_std": 0.07582925260066986, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1657.0, "completions/mean_length": 536.1640625, "completions/mean_terminated_length": 524.2598266601562, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.17109840363264084, "epoch": 0.2936155684533971, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09806145711420756, "learning_rate": 9.014416869876658e-07, "loss": 0.0388, "num_tokens": 366159418.0, "reward": 1.1611328125, "reward_std": 0.10742910951375961, "rewards/accuracy_reward/mean": 0.1713709682226181, "rewards/accuracy_reward/std": 0.3772132694721222, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 472.482421875, "completions/mean_terminated_length": 469.3992004394531, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.18335521966218948, "epoch": 0.29395698190508707, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12148235894454464, "learning_rate": 9.011064820139756e-07, "loss": 0.0114, "num_tokens": 366471729.0, "reward": 1.16552734375, "reward_std": 0.1379566490650177, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.001953125, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 494.330078125, "completions/mean_terminated_length": 485.17291259765625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.1871032938361168, "epoch": 0.29429839535677704, "frac_reward_zero_std": 0.625, "grad_norm": 0.11817712117854275, "learning_rate": 9.007707783306837e-07, "loss": 0.041, "num_tokens": 366800858.0, "reward": 1.20849609375, "reward_std": 0.12606769800186157, "rewards/accuracy_reward/mean": 0.212890625, "rewards/accuracy_reward/std": 0.409751296043396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1466.0, "completions/mean_length": 486.318359375, "completions/mean_terminated_length": 483.2622375488281, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.17702339217066765, "epoch": 0.2946398088084671, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12929161315055857, "learning_rate": 9.004345764146223e-07, "loss": 0.0183, "num_tokens": 367122317.0, "reward": 1.23193359375, "reward_std": 0.14087525010108948, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42402184009552, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.039800092577934265, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1686.0, "completions/mean_length": 449.55078125, "completions/mean_terminated_length": 446.4226989746094, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.19113777205348015, "epoch": 0.29498122226015705, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14114419007768153, "learning_rate": 9.000978767433303e-07, "loss": 0.0083, "num_tokens": 367432375.0, "reward": 1.13916015625, "reward_std": 0.14479908347129822, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.001953125, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1323.0, "completions/mean_length": 484.810546875, "completions/mean_terminated_length": 472.501953125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.17834649980068207, "epoch": 0.295322635711847, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1258671504036811, "learning_rate": 8.99760679795054e-07, "loss": 0.0533, "num_tokens": 367758454.0, "reward": 1.099609375, "reward_std": 0.13457119464874268, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1647.0, "completions/mean_length": 457.80078125, "completions/mean_terminated_length": 451.5647277832031, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.18485325202345848, "epoch": 0.29566404916353706, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12127661419817148, "learning_rate": 8.994229860487461e-07, "loss": 0.0208, "num_tokens": 368067936.0, "reward": 1.1201171875, "reward_std": 0.10278713703155518, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.001953125, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2004.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 483.7265625, "completions/mean_terminated_length": 483.7265625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.18263635784387589, "epoch": 0.29600546261522703, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11645198535016144, "learning_rate": 8.990847959840646e-07, "loss": -0.0113, "num_tokens": 368400548.0, "reward": 1.078125, "reward_std": 0.09429663419723511, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 507.62109375, "completions/mean_terminated_length": 495.49212646484375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.18644704297184944, "epoch": 0.296346876066917, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11124811407057669, "learning_rate": 8.987461100813729e-07, "loss": 0.0154, "num_tokens": 368745602.0, "reward": 1.0625, "reward_std": 0.10345719754695892, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04930410906672478, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1389.0, "completions/mean_length": 495.357421875, "completions/mean_terminated_length": 483.13189697265625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.1701282523572445, "epoch": 0.29668828951860704, "frac_reward_zero_std": 0.65625, "grad_norm": 0.17348979030162343, "learning_rate": 8.984069288217385e-07, "loss": 0.0457, "num_tokens": 369076089.0, "reward": 1.125, "reward_std": 0.11574536561965942, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1539.0, "completions/mean_length": 473.734375, "completions/mean_terminated_length": 470.65362548828125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.16245967149734497, "epoch": 0.297029702970297, "frac_reward_zero_std": 0.4375, "grad_norm": 0.31482670845613825, "learning_rate": 8.980672526869323e-07, "loss": 0.0216, "num_tokens": 369388081.0, "reward": 1.27685546875, "reward_std": 0.21315237879753113, "rewards/accuracy_reward/mean": 0.248046875, "rewards/accuracy_reward/std": 0.4323015511035919, "rewards/format_reward/mean": 0.029296875, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 500.298828125, "completions/mean_terminated_length": 497.2700500488281, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.17655064910650253, "epoch": 0.29737111642198705, "frac_reward_zero_std": 0.28125, "grad_norm": 0.8949279542323872, "learning_rate": 8.977270821594285e-07, "loss": 0.0415, "num_tokens": 369722570.0, "reward": 1.2890625, "reward_std": 0.2757694125175476, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.197265625, "rewards/format_reward/std": 0.3983237147331238, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1289.0, "completions/mean_length": 444.5546875, "completions/mean_terminated_length": 435.1041564941406, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.1836530715227127, "epoch": 0.297712529873677, "frac_reward_zero_std": 0.40625, "grad_norm": 0.5978004015913448, "learning_rate": 8.973864177224031e-07, "loss": 0.0983, "num_tokens": 370027494.0, "reward": 2.04052734375, "reward_std": 0.27171242237091064, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.9140625, "rewards/format_reward/std": 0.28054583072662354, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1708.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 461.166015625, "completions/mean_terminated_length": 461.166015625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.18168045207858086, "epoch": 0.298053943325367, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11750994097903614, "learning_rate": 8.970452598597341e-07, "loss": -0.0154, "num_tokens": 370340443.0, "reward": 2.1982421875, "reward_std": 0.11126624792814255, "rewards/accuracy_reward/mean": 0.205078125, "rewards/accuracy_reward/std": 0.4041535556316376, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 462.142578125, "completions/mean_terminated_length": 459.03912353515625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.1851659119129181, "epoch": 0.29839535677705703, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12144005128411896, "learning_rate": 8.967036090560001e-07, "loss": 0.0371, "num_tokens": 370649604.0, "reward": 2.0986328125, "reward_std": 0.13095206022262573, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1387.0, "completions/mean_length": 455.01953125, "completions/mean_terminated_length": 445.63067626953125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.20208384841680527, "epoch": 0.298736770228747, "frac_reward_zero_std": 0.625, "grad_norm": 0.13642387008909462, "learning_rate": 8.963614657964798e-07, "loss": 0.057, "num_tokens": 370974798.0, "reward": 2.14892578125, "reward_std": 0.16094918549060822, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.06134068965911865, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1882.0, "completions/max_terminated_length": 1882.0, "completions/mean_length": 461.501953125, "completions/mean_terminated_length": 461.501953125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.16574259474873543, "epoch": 0.299078183680437, "frac_reward_zero_std": 0.75, "grad_norm": 0.12005912540827099, "learning_rate": 8.960188305671515e-07, "loss": -0.0273, "num_tokens": 371284159.0, "reward": 2.12841796875, "reward_std": 0.09741386771202087, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1652.0, "completions/max_terminated_length": 1652.0, "completions/mean_length": 424.97265625, "completions/mean_terminated_length": 424.97265625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.18017087876796722, "epoch": 0.299419597132127, "frac_reward_zero_std": 0.71875, "grad_norm": 0.16824576352726062, "learning_rate": 8.956757038546925e-07, "loss": -0.0249, "num_tokens": 371582017.0, "reward": 2.07470703125, "reward_std": 0.11881691217422485, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.024685947224497795, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1852.0, "completions/mean_length": 451.775390625, "completions/mean_terminated_length": 445.5157165527344, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.18747851997613907, "epoch": 0.299761010583817, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1292253455762975, "learning_rate": 8.953320861464777e-07, "loss": 0.0286, "num_tokens": 371888238.0, "reward": 2.048828125, "reward_std": 0.12337133288383484, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.060289934277534485, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 451.076171875, "completions/mean_terminated_length": 435.3352966308594, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.19283964484930038, "epoch": 0.300102424035507, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1795013569036322, "learning_rate": 8.949879779305801e-07, "loss": 0.0339, "num_tokens": 372206837.0, "reward": 2.1044921875, "reward_std": 0.16927212476730347, "rewards/accuracy_reward/mean": 0.16733871400356293, "rewards/accuracy_reward/std": 0.37365487217903137, "rewards/format_reward/mean": 0.962890625, "rewards/format_reward/std": 0.18921469151973724, "rewards/tag_count_reward/mean": 0.9794921875, "rewards/tag_count_reward/std": 0.10639684647321701, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1207.0, "completions/mean_length": 482.478515625, "completions/mean_terminated_length": 422.17645263671875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.21379728615283966, "epoch": 0.300443837487197, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1561254192921303, "learning_rate": 8.946433796957683e-07, "loss": 0.0553, "num_tokens": 372550106.0, "reward": 2.03125, "reward_std": 0.1737637221813202, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.958984375, "rewards/format_reward/std": 0.19852031767368317, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1467188447713852, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1125.0, "completions/mean_length": 385.259765625, "completions/mean_terminated_length": 382.005859375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.20989608019590378, "epoch": 0.30078525093888697, "frac_reward_zero_std": 0.5, "grad_norm": 0.18832762070616912, "learning_rate": 8.942982919315083e-07, "loss": 0.0228, "num_tokens": 372819135.0, "reward": 2.1845703125, "reward_std": 0.18845278024673462, "rewards/accuracy_reward/mean": 0.193359375, "rewards/accuracy_reward/std": 0.39531853795051575, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1275.0, "completions/max_terminated_length": 1275.0, "completions/mean_length": 413.068359375, "completions/mean_terminated_length": 413.068359375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.2099032886326313, "epoch": 0.301126664390577, "frac_reward_zero_std": 0.46875, "grad_norm": 0.2036722822574638, "learning_rate": 8.939527151279606e-07, "loss": -0.0195, "num_tokens": 373110338.0, "reward": 2.17138671875, "reward_std": 0.22276534140110016, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.3937928080558777, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.05493048578500748, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2016.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 445.091796875, "completions/mean_terminated_length": 445.091796875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.1592264138162136, "epoch": 0.301468077842267, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1532872828081225, "learning_rate": 8.9360664977598e-07, "loss": -0.0057, "num_tokens": 373417041.0, "reward": 2.14306640625, "reward_std": 0.15397778153419495, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1538.0, "completions/max_terminated_length": 1538.0, "completions/mean_length": 435.064453125, "completions/mean_terminated_length": 435.064453125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.185231264680624, "epoch": 0.30180949129395696, "frac_reward_zero_std": 0.625, "grad_norm": 0.15632800490400164, "learning_rate": 8.932600963671164e-07, "loss": -0.0231, "num_tokens": 373718162.0, "reward": 2.12890625, "reward_std": 0.1654873639345169, "rewards/accuracy_reward/mean": 0.1391129046678543, "rewards/accuracy_reward/std": 0.34641367197036743, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03121940791606903, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1811.0, "completions/mean_length": 448.1484375, "completions/mean_terminated_length": 445.0176086425781, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.16704745590686798, "epoch": 0.302150904745647, "frac_reward_zero_std": 0.4375, "grad_norm": 0.17320807004183572, "learning_rate": 8.92913055393612e-07, "loss": -0.02, "num_tokens": 374021838.0, "reward": 2.20556640625, "reward_std": 0.22038090229034424, "rewards/accuracy_reward/mean": 0.22177419066429138, "rewards/accuracy_reward/std": 0.4158594012260437, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.052765581756830215, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 378.19140625, "completions/mean_terminated_length": 378.19140625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.21957074105739594, "epoch": 0.30249231819733696, "frac_reward_zero_std": 0.53125, "grad_norm": 0.20846991061290823, "learning_rate": 8.925655273484015e-07, "loss": -0.0094, "num_tokens": 374290896.0, "reward": 2.1279296875, "reward_std": 0.19645589590072632, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.041276250034570694, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2047.0, "completions/max_terminated_length": 1176.0, "completions/mean_length": 469.38671875, "completions/mean_terminated_length": 418.4959411621094, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.21033518761396408, "epoch": 0.302833731649027, "frac_reward_zero_std": 0.5625, "grad_norm": 0.16552751199712834, "learning_rate": 8.922175127251119e-07, "loss": 0.0053, "num_tokens": 374618118.0, "reward": 2.11572265625, "reward_std": 0.15325337648391724, "rewards/accuracy_reward/mean": 0.169921875, "rewards/accuracy_reward/std": 0.3759314715862274, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17416280508041382, "rewards/tag_count_reward/mean": 0.97705078125, "rewards/tag_count_reward/std": 0.12834827601909637, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1226.0, "completions/mean_length": 412.111328125, "completions/mean_terminated_length": 408.90997314453125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.2035425305366516, "epoch": 0.30317514510071697, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1297991807113189, "learning_rate": 8.918690120180612e-07, "loss": 0.0152, "num_tokens": 374914799.0, "reward": 2.08447265625, "reward_std": 0.1316978633403778, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1482.0, "completions/mean_length": 409.650390625, "completions/mean_terminated_length": 399.9941101074219, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.2152133397758007, "epoch": 0.30351655855240695, "frac_reward_zero_std": 0.59375, "grad_norm": 0.15079536704892127, "learning_rate": 8.915200257222579e-07, "loss": 0.0541, "num_tokens": 375198924.0, "reward": 2.07666015625, "reward_std": 0.15204323828220367, "rewards/accuracy_reward/mean": 0.09072580933570862, "rewards/accuracy_reward/std": 0.2875087857246399, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.06134068965911865, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1769.0, "completions/mean_length": 406.69921875, "completions/mean_terminated_length": 403.4872741699219, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.203338410705328, "epoch": 0.303857972004097, "frac_reward_zero_std": 0.6875, "grad_norm": 0.14972422649426226, "learning_rate": 8.911705543333998e-07, "loss": 0.0326, "num_tokens": 375478402.0, "reward": 2.1767578125, "reward_std": 0.10994576662778854, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.3875311613082886, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.041276250034570694, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 417.9765625, "completions/mean_terminated_length": 414.78668212890625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.20910431444644928, "epoch": 0.30419938545578695, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14310275647471554, "learning_rate": 8.908205983478742e-07, "loss": 0.0054, "num_tokens": 375769478.0, "reward": 2.16357421875, "reward_std": 0.16772881150245667, "rewards/accuracy_reward/mean": 0.16796875, "rewards/accuracy_reward/std": 0.374204158782959, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.039800092577934265, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1843.0, "completions/max_terminated_length": 1843.0, "completions/mean_length": 457.587890625, "completions/mean_terminated_length": 457.587890625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.17774154618382454, "epoch": 0.30454079890747693, "frac_reward_zero_std": 0.78125, "grad_norm": 0.10082402842372352, "learning_rate": 8.904701582627566e-07, "loss": -0.0009, "num_tokens": 376070899.0, "reward": 2.107421875, "reward_std": 0.07729348540306091, "rewards/accuracy_reward/mean": 0.11088709533214569, "rewards/accuracy_reward/std": 0.314309298992157, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1303.0, "completions/mean_length": 430.283203125, "completions/mean_terminated_length": 427.1174011230469, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.21409542113542557, "epoch": 0.30488221235916696, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12089494993115801, "learning_rate": 8.901192345758098e-07, "loss": 0.0137, "num_tokens": 376368596.0, "reward": 2.1005859375, "reward_std": 0.14542534947395325, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1176.0, "completions/mean_length": 486.34765625, "completions/mean_terminated_length": 483.2915954589844, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.17036619409918785, "epoch": 0.30522362581085694, "frac_reward_zero_std": 0.40625, "grad_norm": 0.16923141751200652, "learning_rate": 8.897678277854837e-07, "loss": -0.0199, "num_tokens": 376699334.0, "reward": 2.16162109375, "reward_std": 0.2390979826450348, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 449.8203125, "completions/mean_terminated_length": 443.552978515625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.17452074959874153, "epoch": 0.30556503926254697, "frac_reward_zero_std": 0.53125, "grad_norm": 0.14926627899616673, "learning_rate": 8.894159383909151e-07, "loss": 0.0346, "num_tokens": 376998298.0, "reward": 2.208984375, "reward_std": 0.15765544772148132, "rewards/accuracy_reward/mean": 0.216796875, "rewards/accuracy_reward/std": 0.4124660789966583, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.051725368946790695, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 499.908203125, "completions/mean_terminated_length": 493.8372802734375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.18523309752345085, "epoch": 0.30590645271423694, "frac_reward_zero_std": 0.78125, "grad_norm": 0.09890713872356467, "learning_rate": 8.890635668919249e-07, "loss": 0.0255, "num_tokens": 377327115.0, "reward": 2.1005859375, "reward_std": 0.09748993813991547, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1867.0, "completions/mean_length": 479.0703125, "completions/mean_terminated_length": 472.91766357421875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.18856700137257576, "epoch": 0.3062478661659269, "frac_reward_zero_std": 0.71875, "grad_norm": 0.12996035555019333, "learning_rate": 8.887107137890202e-07, "loss": 0.0339, "num_tokens": 377661823.0, "reward": 2.130859375, "reward_std": 0.11437931656837463, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.051725368946790695, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 463.431640625, "completions/mean_terminated_length": 457.2176818847656, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.18678270280361176, "epoch": 0.30658927961761695, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1243976321004675, "learning_rate": 8.883573795833909e-07, "loss": 0.0179, "num_tokens": 377971388.0, "reward": 2.1416015625, "reward_std": 0.13610677421092987, "rewards/accuracy_reward/mean": 0.15833333134651184, "rewards/accuracy_reward/std": 0.3654341399669647, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 492.4609375, "completions/mean_terminated_length": 486.3608093261719, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.17896729707717896, "epoch": 0.3069306930693069, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13694954506773951, "learning_rate": 8.88003564776911e-07, "loss": 0.0021, "num_tokens": 378299336.0, "reward": 2.1220703125, "reward_std": 0.17772163450717926, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1554.0, "completions/mean_length": 465.474609375, "completions/mean_terminated_length": 462.377685546875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.1923762522637844, "epoch": 0.3072721065209969, "frac_reward_zero_std": 0.75, "grad_norm": 0.11486243068141179, "learning_rate": 8.876492698721374e-07, "loss": 0.0156, "num_tokens": 378608235.0, "reward": 2.0625, "reward_std": 0.10572798550128937, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1914.0, "completions/mean_length": 526.00390625, "completions/mean_terminated_length": 514.0196533203125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.18826928734779358, "epoch": 0.30761351997268693, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10893932400945538, "learning_rate": 8.872944953723079e-07, "loss": 0.0142, "num_tokens": 378963565.0, "reward": 2.017578125, "reward_std": 0.11107920855283737, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 471.359375, "completions/mean_terminated_length": 462.06878662109375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.20920706167817116, "epoch": 0.3079549334243769, "frac_reward_zero_std": 0.5625, "grad_norm": 0.17195241568697187, "learning_rate": 8.869392417813427e-07, "loss": 0.0423, "num_tokens": 379287509.0, "reward": 2.09033203125, "reward_std": 0.14370688796043396, "rewards/accuracy_reward/mean": 0.14919355511665344, "rewards/accuracy_reward/std": 0.3566388487815857, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.1843547374010086, "rewards/tag_count_reward/mean": 0.98095703125, "rewards/tag_count_reward/std": 0.10257764905691147, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 445.998046875, "completions/mean_terminated_length": 433.38385009765625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.21632696688175201, "epoch": 0.30829634687606694, "frac_reward_zero_std": 0.625, "grad_norm": 0.1352676265767565, "learning_rate": 8.865835096038413e-07, "loss": 0.0512, "num_tokens": 379595764.0, "reward": 2.1484375, "reward_std": 0.14076435565948486, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 488.263671875, "completions/mean_terminated_length": 479.07073974609375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.19733862578868866, "epoch": 0.3086377603277569, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09859860671848746, "learning_rate": 8.862272993450842e-07, "loss": 0.0438, "num_tokens": 379927723.0, "reward": 2.14599609375, "reward_std": 0.1434323638677597, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1625.0, "completions/mean_length": 483.8359375, "completions/mean_terminated_length": 477.7019958496094, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.18045825138688087, "epoch": 0.3089791737794469, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11458057569895257, "learning_rate": 8.858706115110301e-07, "loss": 0.0304, "num_tokens": 380250343.0, "reward": 2.1884765625, "reward_std": 0.14120376110076904, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3968288004398346, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1447.0, "completions/mean_length": 455.353515625, "completions/mean_terminated_length": 445.96661376953125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.20378128811717033, "epoch": 0.3093205872311369, "frac_reward_zero_std": 0.65625, "grad_norm": 0.14261872521373156, "learning_rate": 8.855134466083165e-07, "loss": 0.0531, "num_tokens": 380564188.0, "reward": 2.09814453125, "reward_std": 0.1332738697528839, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.06134068965911865, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1405.0, "completions/mean_length": 515.52734375, "completions/mean_terminated_length": 512.5283813476562, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.17249465733766556, "epoch": 0.3096620006828269, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0920217783329204, "learning_rate": 8.851558051442581e-07, "loss": 0.0085, "num_tokens": 380905930.0, "reward": 2.12548828125, "reward_std": 0.140555739402771, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1721.0, "completions/mean_length": 461.2734375, "completions/mean_terminated_length": 445.6331481933594, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.20366620272397995, "epoch": 0.3100034141345169, "frac_reward_zero_std": 0.5, "grad_norm": 0.16600896522441816, "learning_rate": 8.847976876268467e-07, "loss": -0.0111, "num_tokens": 381219142.0, "reward": 2.16845703125, "reward_std": 0.2087763249874115, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.4190165400505066, "rewards/format_reward/mean": 0.962890625, "rewards/format_reward/std": 0.18921469151973724, "rewards/tag_count_reward/mean": 0.97900390625, "rewards/tag_count_reward/std": 0.10687512904405594, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1841.0, "completions/max_terminated_length": 1841.0, "completions/mean_length": 479.724609375, "completions/mean_terminated_length": 479.724609375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.18279984593391418, "epoch": 0.3103448275862069, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13305278871860218, "learning_rate": 8.844390945647507e-07, "loss": -0.017, "num_tokens": 381538345.0, "reward": 2.16796875, "reward_std": 0.15748226642608643, "rewards/accuracy_reward/mean": 0.1733870953321457, "rewards/accuracy_reward/std": 0.37896376848220825, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 494.65625, "completions/mean_terminated_length": 491.6164245605469, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.18378398939967155, "epoch": 0.3106862410378969, "frac_reward_zero_std": 0.625, "grad_norm": 0.1357560113797225, "learning_rate": 8.840800264673133e-07, "loss": -0.0003, "num_tokens": 381865305.0, "reward": 2.15478515625, "reward_std": 0.17197220027446747, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2047.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 525.638671875, "completions/mean_terminated_length": 476.5625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.2065976783633232, "epoch": 0.3110276544895869, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1288909507644223, "learning_rate": 8.837204838445528e-07, "loss": 0.0055, "num_tokens": 382214976.0, "reward": 2.01708984375, "reward_std": 0.12233699858188629, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.97607421875, "rewards/tag_count_reward/std": 0.13100102543830872, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1316.0, "completions/max_terminated_length": 1316.0, "completions/mean_length": 432.732421875, "completions/mean_terminated_length": 432.732421875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.22698868066072464, "epoch": 0.3113690679412769, "frac_reward_zero_std": 0.84375, "grad_norm": 0.10600335442244435, "learning_rate": 8.833604672071616e-07, "loss": -0.0203, "num_tokens": 382512007.0, "reward": 2.13623046875, "reward_std": 0.057026512920856476, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1748.0, "completions/mean_length": 441.962890625, "completions/mean_terminated_length": 435.66473388671875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.18586091324687004, "epoch": 0.31171048139296686, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1236898050982683, "learning_rate": 8.829999770665051e-07, "loss": 0.0521, "num_tokens": 382811188.0, "reward": 2.1376953125, "reward_std": 0.1317146122455597, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 462.34765625, "completions/mean_terminated_length": 459.24462890625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.21211522817611694, "epoch": 0.3120518948446569, "frac_reward_zero_std": 0.84375, "grad_norm": 0.0964492318986652, "learning_rate": 8.826390139346213e-07, "loss": 0.0055, "num_tokens": 383130662.0, "reward": 2.04150390625, "reward_std": 0.06808003783226013, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 485.28515625, "completions/mean_terminated_length": 479.1568908691406, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.20366811007261276, "epoch": 0.31239330829634687, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13236227053377095, "learning_rate": 8.822775783242204e-07, "loss": 0.0262, "num_tokens": 383460696.0, "reward": 2.09130859375, "reward_std": 0.13397064805030823, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1499.0, "completions/mean_length": 528.666015625, "completions/mean_terminated_length": 473.33807373046875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.2050783447921276, "epoch": 0.31273472174803685, "frac_reward_zero_std": 0.59375, "grad_norm": 0.20509168402794162, "learning_rate": 8.819156707486831e-07, "loss": 0.0035, "num_tokens": 383822701.0, "reward": 2.06494140625, "reward_std": 0.17989492416381836, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21157780289649963, "rewards/tag_count_reward/mean": 0.97314453125, "rewards/tag_count_reward/std": 0.1386137455701828, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1864.0, "completions/mean_length": 491.6171875, "completions/mean_terminated_length": 485.5137634277344, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.17971624433994293, "epoch": 0.3130761351997269, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09761229340010283, "learning_rate": 8.81553291722061e-07, "loss": 0.0344, "num_tokens": 384151609.0, "reward": 2.1005859375, "reward_std": 0.13063876330852509, "rewards/accuracy_reward/mean": 0.11088709533214569, "rewards/accuracy_reward/std": 0.3143092691898346, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 525.5859375, "completions/mean_terminated_length": 522.6066284179688, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.1539120115339756, "epoch": 0.31341754865141686, "frac_reward_zero_std": 0.40625, "grad_norm": 0.14427209457710846, "learning_rate": 8.811904417590752e-07, "loss": 0.0024, "num_tokens": 384494501.0, "reward": 2.220703125, "reward_std": 0.2564806342124939, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.4190165400505066, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1568.0, "completions/mean_length": 455.45703125, "completions/mean_terminated_length": 452.34051513671875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.18818305432796478, "epoch": 0.3137589621031069, "frac_reward_zero_std": 0.78125, "grad_norm": 0.1036919740568226, "learning_rate": 8.808271213751157e-07, "loss": 0.0187, "num_tokens": 384804671.0, "reward": 2.09619140625, "reward_std": 0.09665633738040924, "rewards/accuracy_reward/mean": 0.10282257944345474, "rewards/accuracy_reward/std": 0.30403366684913635, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 507.431640625, "completions/mean_terminated_length": 498.3516845703125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.16770068556070328, "epoch": 0.31410037555479686, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12158286033675623, "learning_rate": 8.804633310862404e-07, "loss": 0.0247, "num_tokens": 385139884.0, "reward": 2.14794921875, "reward_std": 0.18123675882816315, "rewards/accuracy_reward/mean": 0.16330644488334656, "rewards/accuracy_reward/std": 0.37001824378967285, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 481.4453125, "completions/mean_terminated_length": 478.379638671875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.19467469304800034, "epoch": 0.31444178900648684, "frac_reward_zero_std": 0.5, "grad_norm": 0.1456446864343763, "learning_rate": 8.80099071409175e-07, "loss": -0.0159, "num_tokens": 385467824.0, "reward": 2.23681640625, "reward_std": 0.19557496905326843, "rewards/accuracy_reward/mean": 0.240234375, "rewards/accuracy_reward/std": 0.4276435375213623, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1619.0, "completions/max_terminated_length": 1619.0, "completions/mean_length": 508.421875, "completions/mean_terminated_length": 508.421875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.15697640553116798, "epoch": 0.31478320245817687, "frac_reward_zero_std": 0.625, "grad_norm": 0.12439516175008239, "learning_rate": 8.797343428613121e-07, "loss": -0.0095, "num_tokens": 385812840.0, "reward": 2.1904296875, "reward_std": 0.13825760781764984, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.3937928080558777, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1731.0, "completions/mean_length": 469.66796875, "completions/mean_terminated_length": 466.5792541503906, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.18283028900623322, "epoch": 0.31512461590986685, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11034300882995426, "learning_rate": 8.793691459607097e-07, "loss": 0.0202, "num_tokens": 386131534.0, "reward": 2.07470703125, "reward_std": 0.10219144076108932, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1600.0, "completions/mean_length": 512.970703125, "completions/mean_terminated_length": 503.92340087890625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.16689687594771385, "epoch": 0.3154660293615568, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10897559093659509, "learning_rate": 8.790034812260915e-07, "loss": 0.023, "num_tokens": 386466591.0, "reward": 2.04638671875, "reward_std": 0.11436519771814346, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 522.25, "completions/mean_terminated_length": 510.2362060546875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.17506720125675201, "epoch": 0.31580744281324685, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13176133613030275, "learning_rate": 8.786373491768456e-07, "loss": 0.0538, "num_tokens": 386806911.0, "reward": 2.0703125, "reward_std": 0.16778971254825592, "rewards/accuracy_reward/mean": 0.08669354766607285, "rewards/accuracy_reward/std": 0.281669557094574, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1813.0, "completions/mean_length": 536.357421875, "completions/mean_terminated_length": 527.451904296875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.1726885661482811, "epoch": 0.31614885626493683, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13859235914213935, "learning_rate": 8.782707503330235e-07, "loss": 0.0229, "num_tokens": 387163030.0, "reward": 2.0888671875, "reward_std": 0.13163664937019348, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.1843547374010086, "rewards/tag_count_reward/mean": 0.9814453125, "rewards/tag_count_reward/std": 0.0996444821357727, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 550.232421875, "completions/mean_terminated_length": 505.05230712890625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.17381389066576958, "epoch": 0.31649026971662686, "frac_reward_zero_std": 0.625, "grad_norm": 0.09332946232117804, "learning_rate": 8.779036852153406e-07, "loss": 0.0484, "num_tokens": 387530349.0, "reward": 2.03857421875, "reward_std": 0.14367565512657166, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.962890625, "rewards/format_reward/std": 0.18921469151973724, "rewards/tag_count_reward/mean": 0.97412109375, "rewards/tag_count_reward/std": 0.13340787589550018, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1329.0, "completions/max_terminated_length": 1329.0, "completions/mean_length": 459.1875, "completions/mean_terminated_length": 459.1875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.18395094573497772, "epoch": 0.31683168316831684, "frac_reward_zero_std": 0.625, "grad_norm": 0.13158782276413833, "learning_rate": 8.775361543451735e-07, "loss": 0.0044, "num_tokens": 387847549.0, "reward": 2.1591796875, "reward_std": 0.15397028625011444, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 517.912109375, "completions/mean_terminated_length": 511.91180419921875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.15198392793536186, "epoch": 0.3171730966200068, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11361987734056582, "learning_rate": 8.771681582445612e-07, "loss": 0.0416, "num_tokens": 388185600.0, "reward": 2.1591796875, "reward_std": 0.16760700941085815, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1881.0, "completions/max_terminated_length": 1881.0, "completions/mean_length": 460.3671875, "completions/mean_terminated_length": 460.3671875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.1881105937063694, "epoch": 0.31751451007169684, "frac_reward_zero_std": 0.78125, "grad_norm": 0.11075314183963023, "learning_rate": 8.767996974362034e-07, "loss": -0.0096, "num_tokens": 388497148.0, "reward": 2.0966796875, "reward_std": 0.0791100487112999, "rewards/accuracy_reward/mean": 0.10080645233392715, "rewards/accuracy_reward/std": 0.30137622356414795, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1285.0, "completions/max_terminated_length": 1285.0, "completions/mean_length": 440.470703125, "completions/mean_terminated_length": 440.470703125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.20820658281445503, "epoch": 0.3178559235233868, "frac_reward_zero_std": 0.65625, "grad_norm": 0.14918209205171365, "learning_rate": 8.764307724434592e-07, "loss": -0.0084, "num_tokens": 388807741.0, "reward": 2.169921875, "reward_std": 0.1331133246421814, "rewards/accuracy_reward/mean": 0.169921875, "rewards/accuracy_reward/std": 0.3759314715862274, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1570.0, "completions/mean_length": 496.39453125, "completions/mean_terminated_length": 490.3098449707031, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.16531770303845406, "epoch": 0.3181973369750768, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12613571932578294, "learning_rate": 8.76061383790348e-07, "loss": 0.0228, "num_tokens": 389136919.0, "reward": 2.134765625, "reward_std": 0.1466817855834961, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1429.0, "completions/mean_length": 454.5546875, "completions/mean_terminated_length": 451.4364013671875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.17415719851851463, "epoch": 0.3185387504267668, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1493423196304644, "learning_rate": 8.75691532001547e-07, "loss": 0.0122, "num_tokens": 389444899.0, "reward": 2.13916015625, "reward_std": 0.16677552461624146, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1561.0, "completions/mean_length": 517.44140625, "completions/mean_terminated_length": 502.3471374511719, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.16182183101773262, "epoch": 0.3188801638784568, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12870230184989212, "learning_rate": 8.753212176023914e-07, "loss": 0.0637, "num_tokens": 389786853.0, "reward": 2.10595703125, "reward_std": 0.16706153750419617, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1731.0, "completions/mean_length": 475.59375, "completions/mean_terminated_length": 450.6488342285156, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.20563166961073875, "epoch": 0.31922157733014683, "frac_reward_zero_std": 0.53125, "grad_norm": 0.17250786398891024, "learning_rate": 8.74950441118874e-07, "loss": 0.0182, "num_tokens": 390116421.0, "reward": 2.1787109375, "reward_std": 0.15981777012348175, "rewards/accuracy_reward/mean": 0.23828125, "rewards/accuracy_reward/std": 0.42644867300987244, "rewards/format_reward/mean": 0.962890625, "rewards/format_reward/std": 0.18921469151973724, "rewards/tag_count_reward/mean": 0.9775390625, "rewards/tag_count_reward/std": 0.11592138558626175, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1629.0, "completions/mean_length": 499.23828125, "completions/mean_terminated_length": 493.16473388671875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.17671950906515121, "epoch": 0.3195629907818368, "frac_reward_zero_std": 0.53125, "grad_norm": 0.16004288578049433, "learning_rate": 8.745792030776433e-07, "loss": 0.0204, "num_tokens": 390454383.0, "reward": 2.1279296875, "reward_std": 0.20981693267822266, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1307.0, "completions/mean_length": 441.255859375, "completions/mean_terminated_length": 428.6043395996094, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.2144823558628559, "epoch": 0.3199044042335268, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1350116058668752, "learning_rate": 8.742075040060037e-07, "loss": 0.0263, "num_tokens": 390758658.0, "reward": 2.115234375, "reward_std": 0.15091589093208313, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1819.0, "completions/max_terminated_length": 1819.0, "completions/mean_length": 491.2734375, "completions/mean_terminated_length": 491.2734375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.17827314883470535, "epoch": 0.3202458176852168, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1586573379855675, "learning_rate": 8.738353444319146e-07, "loss": -0.0106, "num_tokens": 391084718.0, "reward": 2.126953125, "reward_std": 0.15623098611831665, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1730.0, "completions/max_terminated_length": 1730.0, "completions/mean_length": 483.1171875, "completions/mean_terminated_length": 483.1171875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.1774604469537735, "epoch": 0.3205872311369068, "frac_reward_zero_std": 0.8125, "grad_norm": 0.10086252955433189, "learning_rate": 8.734627248839889e-07, "loss": -0.0121, "num_tokens": 391406634.0, "reward": 2.0419921875, "reward_std": 0.05844097584486008, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 468.734375, "completions/mean_terminated_length": 456.2992248535156, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.16994578018784523, "epoch": 0.32092864458859677, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1677951414081704, "learning_rate": 8.730896458914934e-07, "loss": 0.0694, "num_tokens": 391720674.0, "reward": 2.1123046875, "reward_std": 0.171514093875885, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06960996240377426, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1390.0, "completions/max_terminated_length": 1390.0, "completions/mean_length": 477.677734375, "completions/mean_terminated_length": 477.677734375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.1671922728419304, "epoch": 0.3212700580402868, "frac_reward_zero_std": 0.625, "grad_norm": 0.11959475487821761, "learning_rate": 8.727161079843475e-07, "loss": -0.0085, "num_tokens": 392040477.0, "reward": 2.2109375, "reward_std": 0.15052062273025513, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4083731174468994, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2047.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 567.146484375, "completions/mean_terminated_length": 525.5441284179688, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.1882970854640007, "epoch": 0.3216114714919768, "frac_reward_zero_std": 0.625, "grad_norm": 0.1455280871789765, "learning_rate": 8.723421116931221e-07, "loss": 0.0088, "num_tokens": 392427848.0, "reward": 1.99755859375, "reward_std": 0.11784306168556213, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.97607421875, "rewards/tag_count_reward/std": 0.1281694769859314, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 457.56640625, "completions/mean_terminated_length": 454.4540100097656, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.17029047012329102, "epoch": 0.3219528849436668, "frac_reward_zero_std": 0.625, "grad_norm": 0.15621304718428983, "learning_rate": 8.719676575490393e-07, "loss": 0.0206, "num_tokens": 392737322.0, "reward": 2.16259765625, "reward_std": 0.13334840536117554, "rewards/accuracy_reward/mean": 0.16796875, "rewards/accuracy_reward/std": 0.374204158782959, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1923.0, "completions/mean_length": 498.162109375, "completions/mean_terminated_length": 495.129150390625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.14619988575577736, "epoch": 0.3222942983953568, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10181123116790922, "learning_rate": 8.715927460839717e-07, "loss": 0.0215, "num_tokens": 393068637.0, "reward": 2.14697265625, "reward_std": 0.11470264196395874, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1255.0, "completions/mean_length": 467.37109375, "completions/mean_terminated_length": 464.27789306640625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.17239126935601234, "epoch": 0.32263571184704676, "frac_reward_zero_std": 0.625, "grad_norm": 0.1390105644515243, "learning_rate": 8.712173778304414e-07, "loss": 0.0033, "num_tokens": 393382859.0, "reward": 2.14892578125, "reward_std": 0.15288038551807404, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 501.791015625, "completions/mean_terminated_length": 448.7192077636719, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.20090164989233017, "epoch": 0.3229771252987368, "frac_reward_zero_std": 0.625, "grad_norm": 0.1507794992408975, "learning_rate": 8.708415533216192e-07, "loss": 0.0426, "num_tokens": 393726480.0, "reward": 2.09130859375, "reward_std": 0.14866800606250763, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.1939331740140915, "rewards/tag_count_reward/mean": 0.97216796875, "rewards/tag_count_reward/std": 0.1401766687631607, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2047.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 527.365234375, "completions/mean_terminated_length": 484.6445617675781, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.185262318700552, "epoch": 0.32331853875042676, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14736018431129902, "learning_rate": 8.70465273091324e-07, "loss": -0.0308, "num_tokens": 394074667.0, "reward": 2.0849609375, "reward_std": 0.14291372895240784, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.9775390625, "rewards/tag_count_reward/std": 0.12407544255256653, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 483.4609375, "completions/mean_terminated_length": 474.2397155761719, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.16906831040978432, "epoch": 0.32365995220211674, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12958553618021343, "learning_rate": 8.700885376740221e-07, "loss": 0.0218, "num_tokens": 394391415.0, "reward": 2.13623046875, "reward_std": 0.1360015571117401, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1302.0, "completions/max_terminated_length": 1302.0, "completions/mean_length": 457.716796875, "completions/mean_terminated_length": 457.716796875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.19401036202907562, "epoch": 0.32400136565380677, "frac_reward_zero_std": 0.71875, "grad_norm": 0.14936299744960196, "learning_rate": 8.697113476048263e-07, "loss": -0.0024, "num_tokens": 394702134.0, "reward": 2.1640625, "reward_std": 0.1102677434682846, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1454.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 462.39453125, "completions/mean_terminated_length": 462.39453125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.18883387744426727, "epoch": 0.32434277910549675, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14647175817870883, "learning_rate": 8.693337034194952e-07, "loss": -0.0049, "num_tokens": 395025152.0, "reward": 2.12109375, "reward_std": 0.15407104790210724, "rewards/accuracy_reward/mean": 0.16796875, "rewards/accuracy_reward/std": 0.374204158782959, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17416280508041382, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1928.0, "completions/mean_length": 518.12109375, "completions/mean_terminated_length": 503.0335388183594, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.15521689504384995, "epoch": 0.3246841925571868, "frac_reward_zero_std": 0.5, "grad_norm": 0.1526438783997024, "learning_rate": 8.689556056544323e-07, "loss": 0.0704, "num_tokens": 395374350.0, "reward": 2.205078125, "reward_std": 0.19998320937156677, "rewards/accuracy_reward/mean": 0.224609375, "rewards/accuracy_reward/std": 0.41773295402526855, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.07459938526153564, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1267.0, "completions/mean_length": 495.921875, "completions/mean_terminated_length": 489.8353271484375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.16613050922751427, "epoch": 0.32502560600887676, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11856579181069446, "learning_rate": 8.685770548466857e-07, "loss": 0.0168, "num_tokens": 395707798.0, "reward": 2.1572265625, "reward_std": 0.11485862731933594, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1532.0, "completions/max_terminated_length": 1532.0, "completions/mean_length": 455.314453125, "completions/mean_terminated_length": 455.314453125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.1760377250611782, "epoch": 0.32536701946056673, "frac_reward_zero_std": 0.5625, "grad_norm": 0.15393067864805207, "learning_rate": 8.681980515339463e-07, "loss": 0.0025, "num_tokens": 396020775.0, "reward": 2.13134765625, "reward_std": 0.15211772918701172, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1853.0, "completions/mean_length": 482.58984375, "completions/mean_terminated_length": 476.4510192871094, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.19030171260237694, "epoch": 0.32570843291225676, "frac_reward_zero_std": 0.625, "grad_norm": 0.13471576323245363, "learning_rate": 8.678185962545486e-07, "loss": 0.0255, "num_tokens": 396341781.0, "reward": 2.1396484375, "reward_std": 0.15798506140708923, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1311.0, "completions/max_terminated_length": 1311.0, "completions/mean_length": 479.34375, "completions/mean_terminated_length": 479.34375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.20745376497507095, "epoch": 0.32604984636394674, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1688276655889367, "learning_rate": 8.674386895474688e-07, "loss": -0.0069, "num_tokens": 396660101.0, "reward": 2.21875, "reward_std": 0.19155393540859222, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41380295157432556, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1252.0, "completions/max_terminated_length": 1252.0, "completions/mean_length": 449.166015625, "completions/mean_terminated_length": 449.166015625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.18593702092766762, "epoch": 0.3263912598156367, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1604769692623356, "learning_rate": 8.670583319523236e-07, "loss": -0.0054, "num_tokens": 396963450.0, "reward": 2.177734375, "reward_std": 0.15360495448112488, "rewards/accuracy_reward/mean": 0.177734375, "rewards/accuracy_reward/std": 0.3826628625392914, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1227.0, "completions/mean_length": 511.2109375, "completions/mean_terminated_length": 499.1102294921875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.17426764592528343, "epoch": 0.32673267326732675, "frac_reward_zero_std": 0.625, "grad_norm": 0.22012287641577263, "learning_rate": 8.666775240093711e-07, "loss": 0.0531, "num_tokens": 397302406.0, "reward": 2.13134765625, "reward_std": 0.1535947024822235, "rewards/accuracy_reward/mean": 0.15120968222618103, "rewards/accuracy_reward/std": 0.35861483216285706, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1621.0, "completions/mean_length": 484.28125, "completions/mean_terminated_length": 471.968505859375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.18675053492188454, "epoch": 0.3270740867190167, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1313387699786766, "learning_rate": 8.662962662595088e-07, "loss": 0.0514, "num_tokens": 397626086.0, "reward": 2.10546875, "reward_std": 0.14816752076148987, "rewards/accuracy_reward/mean": 0.12298387289047241, "rewards/accuracy_reward/std": 0.32875028252601624, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1368.0, "completions/mean_length": 452.6640625, "completions/mean_terminated_length": 449.5420837402344, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.16967759281396866, "epoch": 0.32741550017070675, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11116193531183433, "learning_rate": 8.659145592442727e-07, "loss": 0.0099, "num_tokens": 397930506.0, "reward": 2.1474609375, "reward_std": 0.10816384106874466, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 448.859375, "completions/mean_terminated_length": 448.859375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.20825842022895813, "epoch": 0.32775691362239673, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1561136708174652, "learning_rate": 8.655324035058372e-07, "loss": 0.0017, "num_tokens": 398237442.0, "reward": 2.220703125, "reward_std": 0.14884379506111145, "rewards/accuracy_reward/mean": 0.220703125, "rewards/accuracy_reward/std": 0.4151262938976288, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1404.0, "completions/mean_length": 497.998046875, "completions/mean_terminated_length": 491.9196472167969, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.1576131321489811, "epoch": 0.3280983270740867, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12469250681852109, "learning_rate": 8.651497995870145e-07, "loss": 0.036, "num_tokens": 398571025.0, "reward": 2.1396484375, "reward_std": 0.14306147396564484, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1876.0, "completions/mean_length": 517.71484375, "completions/mean_terminated_length": 514.7201538085938, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.1646818369626999, "epoch": 0.32843974052577674, "frac_reward_zero_std": 0.71875, "grad_norm": 0.12882410530294358, "learning_rate": 8.647667480312525e-07, "loss": 0.0023, "num_tokens": 398918367.0, "reward": 2.05517578125, "reward_std": 0.11321146041154861, "rewards/accuracy_reward/mean": 0.060483869165182114, "rewards/accuracy_reward/std": 0.2386218160390854, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1921.0, "completions/mean_length": 491.0625, "completions/mean_terminated_length": 488.0156555175781, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.17915886268019676, "epoch": 0.3287811539774667, "frac_reward_zero_std": 0.625, "grad_norm": 0.12228318110276756, "learning_rate": 8.643832493826357e-07, "loss": 0.0186, "num_tokens": 399252383.0, "reward": 2.11767578125, "reward_std": 0.14545375108718872, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310528099536896, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1779.0, "completions/mean_length": 515.46875, "completions/mean_terminated_length": 506.4361572265625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.16016092523932457, "epoch": 0.3291225674291567, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13214194187180345, "learning_rate": 8.639993041858832e-07, "loss": 0.0324, "num_tokens": 399593679.0, "reward": 2.1611328125, "reward_std": 0.16349859535694122, "rewards/accuracy_reward/mean": 0.173828125, "rewards/accuracy_reward/std": 0.3793322443962097, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1220.0, "completions/mean_length": 502.775390625, "completions/mean_terminated_length": 496.7157287597656, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.1706901490688324, "epoch": 0.3294639808808467, "frac_reward_zero_std": 0.78125, "grad_norm": 0.09590611557115554, "learning_rate": 8.636149129863484e-07, "loss": 0.0151, "num_tokens": 399929516.0, "reward": 2.0556640625, "reward_std": 0.08943547308444977, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 465.86328125, "completions/mean_terminated_length": 462.7671203613281, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.19015223905444145, "epoch": 0.3298053943325367, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14614814238146598, "learning_rate": 8.632300763300187e-07, "loss": 0.0167, "num_tokens": 400258358.0, "reward": 2.0927734375, "reward_std": 0.16182070970535278, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1816.0, "completions/mean_length": 447.146484375, "completions/mean_terminated_length": 444.0137023925781, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.19872674345970154, "epoch": 0.3301468077842267, "frac_reward_zero_std": 0.65625, "grad_norm": 0.14782406067174333, "learning_rate": 8.628447947635135e-07, "loss": 0.0294, "num_tokens": 400565345.0, "reward": 2.076171875, "reward_std": 0.1373281031847, "rewards/accuracy_reward/mean": 0.08870967477560043, "rewards/accuracy_reward/std": 0.284611314535141, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1526.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 454.40625, "completions/mean_terminated_length": 454.40625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.18299180641770363, "epoch": 0.3304882212359167, "frac_reward_zero_std": 0.6875, "grad_norm": 0.13368961232035587, "learning_rate": 8.624590688340846e-07, "loss": -0.0038, "num_tokens": 400872145.0, "reward": 2.1796875, "reward_std": 0.1267765313386917, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.38430243730545044, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 451.916015625, "completions/mean_terminated_length": 439.3484191894531, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.1932714395225048, "epoch": 0.3308296346876067, "frac_reward_zero_std": 0.59375, "grad_norm": 0.17913006822418745, "learning_rate": 8.62072899089615e-07, "loss": 0.0725, "num_tokens": 401178518.0, "reward": 2.130859375, "reward_std": 0.1775195598602295, "rewards/accuracy_reward/mean": 0.14919355511665344, "rewards/accuracy_reward/std": 0.3566388487815857, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1129.0, "completions/max_terminated_length": 1129.0, "completions/mean_length": 423.724609375, "completions/mean_terminated_length": 423.724609375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.20181425660848618, "epoch": 0.3311710481392967, "frac_reward_zero_std": 0.65625, "grad_norm": 0.14225998622689007, "learning_rate": 8.616862860786177e-07, "loss": -0.0049, "num_tokens": 401476809.0, "reward": 2.15234375, "reward_std": 0.13913065195083618, "rewards/accuracy_reward/mean": 0.1572580635547638, "rewards/accuracy_reward/std": 0.36441144347190857, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2047.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 571.66015625, "completions/mean_terminated_length": 505.4203796386719, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.21358149126172066, "epoch": 0.3315124615909867, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14948156321269757, "learning_rate": 8.612992303502358e-07, "loss": 0.0058, "num_tokens": 401860651.0, "reward": 2.06396484375, "reward_std": 0.169927716255188, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.2422981858253479, "rewards/tag_count_reward/mean": 0.96240234375, "rewards/tag_count_reward/std": 0.14896081387996674, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1566.0, "completions/mean_length": 500.8125, "completions/mean_terminated_length": 497.78472900390625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.16146819293498993, "epoch": 0.33185387504267666, "frac_reward_zero_std": 0.59375, "grad_norm": 0.22413418843264776, "learning_rate": 8.609117324542409e-07, "loss": 0.0251, "num_tokens": 402192123.0, "reward": 2.0986328125, "reward_std": 0.16383016109466553, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1349.0, "completions/mean_length": 475.68359375, "completions/mean_terminated_length": 472.6066589355469, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.1774294078350067, "epoch": 0.3321952884943667, "frac_reward_zero_std": 0.46875, "grad_norm": 0.17450063628783385, "learning_rate": 8.605237929410326e-07, "loss": 0.0159, "num_tokens": 402513913.0, "reward": 2.19384765625, "reward_std": 0.2055574655532837, "rewards/accuracy_reward/mean": 0.197265625, "rewards/accuracy_reward/std": 0.3983237147331238, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 474.251953125, "completions/mean_terminated_length": 471.1722106933594, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.18668507039546967, "epoch": 0.33253670194605667, "frac_reward_zero_std": 0.71875, "grad_norm": 0.12208212347664676, "learning_rate": 8.601354123616382e-07, "loss": 0.0008, "num_tokens": 402840378.0, "reward": 2.03564453125, "reward_std": 0.09638254344463348, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1825.0, "completions/mean_length": 466.505859375, "completions/mean_terminated_length": 463.41094970703125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.1819709613919258, "epoch": 0.3328781153977467, "frac_reward_zero_std": 0.625, "grad_norm": 0.16373256443500608, "learning_rate": 8.597465912677112e-07, "loss": 0.0008, "num_tokens": 403152813.0, "reward": 2.15087890625, "reward_std": 0.15777617692947388, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1316.0, "completions/mean_length": 487.748046875, "completions/mean_terminated_length": 484.6947021484375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.15929631516337395, "epoch": 0.3332195288494367, "frac_reward_zero_std": 0.625, "grad_norm": 0.1668379570181365, "learning_rate": 8.593573302115306e-07, "loss": 0.0035, "num_tokens": 403483004.0, "reward": 2.14892578125, "reward_std": 0.15525075793266296, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1682.0, "completions/mean_length": 531.4609375, "completions/mean_terminated_length": 522.5225830078125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.14461824670433998, "epoch": 0.33356094230112665, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13198952736497652, "learning_rate": 8.589676297460005e-07, "loss": 0.0199, "num_tokens": 403827320.0, "reward": 2.22607421875, "reward_std": 0.187662735581398, "rewards/accuracy_reward/mean": 0.236328125, "rewards/accuracy_reward/std": 0.42524150013923645, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 444.08984375, "completions/mean_terminated_length": 437.8000183105469, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.17910680174827576, "epoch": 0.3339023557528167, "frac_reward_zero_std": 0.6875, "grad_norm": 0.13680467978147495, "learning_rate": 8.585774904246495e-07, "loss": 0.0254, "num_tokens": 404134838.0, "reward": 2.0771484375, "reward_std": 0.1244475394487381, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1386.0, "completions/mean_length": 472.548828125, "completions/mean_terminated_length": 469.46575927734375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.15479229390621185, "epoch": 0.33424376920450666, "frac_reward_zero_std": 0.65625, "grad_norm": 1.3445593950199555, "learning_rate": 8.581869128016289e-07, "loss": 0.0203, "num_tokens": 404447855.0, "reward": 2.16064453125, "reward_std": 0.12947013974189758, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 483.228515625, "completions/mean_terminated_length": 480.1663513183594, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.15630831196904182, "epoch": 0.33458518265619663, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11054344862562406, "learning_rate": 8.577958974317131e-07, "loss": -0.0074, "num_tokens": 404767124.0, "reward": 2.08349609375, "reward_std": 0.10073354840278625, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1555.0, "completions/mean_length": 453.537109375, "completions/mean_terminated_length": 447.2843322753906, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.17047125846147537, "epoch": 0.33492659610788666, "frac_reward_zero_std": 0.65625, "grad_norm": 0.17377121549235136, "learning_rate": 8.57404444870298e-07, "loss": 0.0395, "num_tokens": 405073415.0, "reward": 2.1357421875, "reward_std": 0.1366589516401291, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 452.90234375, "completions/mean_terminated_length": 449.78082275390625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.18146878480911255, "epoch": 0.33526800955957664, "frac_reward_zero_std": 0.65625, "grad_norm": 0.14662776673874955, "learning_rate": 8.570125556734003e-07, "loss": 0.006, "num_tokens": 405383045.0, "reward": 2.11181640625, "reward_std": 0.14407400786876678, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1310.0, "completions/mean_length": 454.453125, "completions/mean_terminated_length": 448.2039489746094, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.16846051812171936, "epoch": 0.3356094230112666, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1553650885181736, "learning_rate": 8.566202303976576e-07, "loss": 0.0174, "num_tokens": 405702349.0, "reward": 2.1943359375, "reward_std": 0.19178418815135956, "rewards/accuracy_reward/mean": 0.201171875, "rewards/accuracy_reward/std": 0.4012683033943176, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1615.0, "completions/mean_length": 439.7265625, "completions/mean_terminated_length": 436.5792541503906, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.16216512769460678, "epoch": 0.33595083646295665, "frac_reward_zero_std": 0.65625, "grad_norm": 0.16159891835301066, "learning_rate": 8.562274696003261e-07, "loss": 0.0226, "num_tokens": 406004449.0, "reward": 2.19970703125, "reward_std": 0.12294073402881622, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.4027182459831238, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1354.0, "completions/max_terminated_length": 1354.0, "completions/mean_length": 452.1796875, "completions/mean_terminated_length": 452.1796875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.17328746616840363, "epoch": 0.3362922499146466, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13178420019626638, "learning_rate": 8.55834273839281e-07, "loss": -0.0122, "num_tokens": 406324797.0, "reward": 2.212890625, "reward_std": 0.1346432864665985, "rewards/accuracy_reward/mean": 0.212890625, "rewards/accuracy_reward/std": 0.409751296043396, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1492.0, "completions/mean_length": 495.365234375, "completions/mean_terminated_length": 486.21417236328125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.1652621440589428, "epoch": 0.33663366336633666, "frac_reward_zero_std": 0.71875, "grad_norm": 0.12061727225555084, "learning_rate": 8.554406436730153e-07, "loss": 0.0456, "num_tokens": 406659640.0, "reward": 2.05615234375, "reward_std": 0.12060976773500443, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 453.990234375, "completions/mean_terminated_length": 447.7392272949219, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.16690585762262344, "epoch": 0.33697507681802663, "frac_reward_zero_std": 0.8125, "grad_norm": 0.10324183713678108, "learning_rate": 8.550465796606388e-07, "loss": 0.0404, "num_tokens": 406971507.0, "reward": 2.0712890625, "reward_std": 0.07735395431518555, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1325.0, "completions/mean_length": 508.796875, "completions/mean_terminated_length": 505.78472900390625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.1530686318874359, "epoch": 0.3373164902697166, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12252727767890743, "learning_rate": 8.546520823618782e-07, "loss": 0.0074, "num_tokens": 407308811.0, "reward": 2.07177734375, "reward_std": 0.12245651334524155, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05285605043172836, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 503.865234375, "completions/mean_terminated_length": 494.7642517089844, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.14473232254385948, "epoch": 0.33765790372140664, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1263605258939916, "learning_rate": 8.542571523370748e-07, "loss": 0.0318, "num_tokens": 407648310.0, "reward": 2.14501953125, "reward_std": 0.17143908143043518, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.059313252568244934, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1062.0, "completions/mean_length": 460.482421875, "completions/mean_terminated_length": 457.375732421875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.15467942133545876, "epoch": 0.3379993171730966, "frac_reward_zero_std": 0.6875, "grad_norm": 0.14537612022231647, "learning_rate": 8.538617901471849e-07, "loss": 0.0047, "num_tokens": 407966269.0, "reward": 2.0615234375, "reward_std": 0.10893556475639343, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.041276250034570694, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1983.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 453.216796875, "completions/mean_terminated_length": 453.216796875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.16135616973042488, "epoch": 0.3383407306247866, "frac_reward_zero_std": 0.625, "grad_norm": 0.13852803087300036, "learning_rate": 8.534659963537787e-07, "loss": -0.0075, "num_tokens": 408281020.0, "reward": 2.1328125, "reward_std": 0.1672247350215912, "rewards/accuracy_reward/mean": 0.13709677755832672, "rewards/accuracy_reward/std": 0.34429675340652466, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1305.0, "completions/mean_length": 461.81640625, "completions/mean_terminated_length": 455.5960998535156, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.16969454661011696, "epoch": 0.3386821440764766, "frac_reward_zero_std": 0.65625, "grad_norm": 0.15061188211878296, "learning_rate": 8.530697715190395e-07, "loss": 0.0317, "num_tokens": 408599774.0, "reward": 2.13330078125, "reward_std": 0.1424657702445984, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1522.0, "completions/mean_length": 458.25, "completions/mean_terminated_length": 455.1389465332031, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.16767336428165436, "epoch": 0.3390235575281666, "frac_reward_zero_std": 0.6875, "grad_norm": 0.14128647805864208, "learning_rate": 8.526731162057626e-07, "loss": 0.0103, "num_tokens": 408912286.0, "reward": 2.10791015625, "reward_std": 0.1335907280445099, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.045470330864191055, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1785.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 474.41015625, "completions/mean_terminated_length": 474.41015625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.16212403029203415, "epoch": 0.33936497097985663, "frac_reward_zero_std": 0.59375, "grad_norm": 0.15861004247149443, "learning_rate": 8.522760309773552e-07, "loss": -0.0165, "num_tokens": 409231728.0, "reward": 2.17138671875, "reward_std": 0.161758154630661, "rewards/accuracy_reward/mean": 0.173828125, "rewards/accuracy_reward/std": 0.3793322443962097, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1727.0, "completions/mean_length": 468.525390625, "completions/mean_terminated_length": 465.4344482421875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.1615452691912651, "epoch": 0.3397063844315466, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13818031162790098, "learning_rate": 8.518785163978343e-07, "loss": 0.0131, "num_tokens": 409548429.0, "reward": 2.08251953125, "reward_std": 0.13887175917625427, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1774.0, "completions/mean_length": 491.43359375, "completions/mean_terminated_length": 488.3874816894531, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.14891327917575836, "epoch": 0.3400477978832366, "frac_reward_zero_std": 0.625, "grad_norm": 0.1383769699623334, "learning_rate": 8.514805730318278e-07, "loss": 0.0135, "num_tokens": 409874907.0, "reward": 2.171875, "reward_std": 0.13427706062793732, "rewards/accuracy_reward/mean": 0.177734375, "rewards/accuracy_reward/std": 0.3826628625392914, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1396.0, "completions/mean_length": 505.421875, "completions/mean_terminated_length": 496.330078125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.1559780389070511, "epoch": 0.3403892113349266, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14504315552352737, "learning_rate": 8.510822014445718e-07, "loss": 0.0201, "num_tokens": 410212403.0, "reward": 2.19287109375, "reward_std": 0.1874156892299652, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.4027182459831238, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1550.0, "completions/mean_length": 479.041015625, "completions/mean_terminated_length": 475.97064208984375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.1535334400832653, "epoch": 0.3407306247866166, "frac_reward_zero_std": 0.8125, "grad_norm": 0.11199883242854679, "learning_rate": 8.506834022019114e-07, "loss": 0.0222, "num_tokens": 410537656.0, "reward": 2.17626953125, "reward_std": 0.07729420065879822, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.38430243730545044, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1911.0, "completions/mean_length": 456.41015625, "completions/mean_terminated_length": 453.2955017089844, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.15982425585389137, "epoch": 0.34107203823830656, "frac_reward_zero_std": 0.625, "grad_norm": 0.1451024080058195, "learning_rate": 8.502841758702982e-07, "loss": 0.0065, "num_tokens": 410843338.0, "reward": 2.14697265625, "reward_std": 0.15336677432060242, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1380.0, "completions/max_terminated_length": 1380.0, "completions/mean_length": 457.78125, "completions/mean_terminated_length": 457.78125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.14972316473722458, "epoch": 0.3414134516899966, "frac_reward_zero_std": 0.875, "grad_norm": 0.07511219260761805, "learning_rate": 8.498845230167912e-07, "loss": -0.0026, "num_tokens": 411156666.0, "reward": 2.1328125, "reward_std": 0.036034777760505676, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 545.787109375, "completions/mean_terminated_length": 515.8804931640625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.13244562596082687, "epoch": 0.34175486514168657, "frac_reward_zero_std": 0.5, "grad_norm": 0.18503900380126528, "learning_rate": 8.494844442090552e-07, "loss": 0.0052, "num_tokens": 411520333.0, "reward": 2.220703125, "reward_std": 0.21495598554611206, "rewards/accuracy_reward/mean": 0.275390625, "rewards/accuracy_reward/std": 0.44714778661727905, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.978515625, "rewards/tag_count_reward/std": 0.11819476634263992, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1187.0, "completions/max_terminated_length": 1187.0, "completions/mean_length": 473.212890625, "completions/mean_terminated_length": 473.212890625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.1579752042889595, "epoch": 0.3420962785933766, "frac_reward_zero_std": 0.75, "grad_norm": 0.18273558802927187, "learning_rate": 8.490839400153594e-07, "loss": 0.0067, "num_tokens": 411844538.0, "reward": 2.130859375, "reward_std": 0.10456396639347076, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1500.0, "completions/mean_length": 511.9140625, "completions/mean_terminated_length": 505.8921813964844, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.14950444176793098, "epoch": 0.3424376920450666, "frac_reward_zero_std": 0.6875, "grad_norm": 0.14172988413333093, "learning_rate": 8.486830110045779e-07, "loss": 0.025, "num_tokens": 412186622.0, "reward": 2.076171875, "reward_std": 0.1210584044456482, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10772226005792618, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1722.0, "completions/max_terminated_length": 1722.0, "completions/mean_length": 470.619140625, "completions/mean_terminated_length": 470.619140625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.14636238664388657, "epoch": 0.34277910549675655, "frac_reward_zero_std": 0.8125, "grad_norm": 0.10857536142910165, "learning_rate": 8.482816577461879e-07, "loss": -0.009, "num_tokens": 412504427.0, "reward": 2.068359375, "reward_std": 0.0683302953839302, "rewards/accuracy_reward/mean": 0.07056451588869095, "rewards/accuracy_reward/std": 0.25635457038879395, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1442.0, "completions/max_terminated_length": 1442.0, "completions/mean_length": 465.462890625, "completions/mean_terminated_length": 465.462890625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.15024393051862717, "epoch": 0.3431205189484466, "frac_reward_zero_std": 0.625, "grad_norm": 0.1495709808346367, "learning_rate": 8.478798808102691e-07, "loss": -0.0023, "num_tokens": 412822776.0, "reward": 2.22607421875, "reward_std": 0.12516267597675323, "rewards/accuracy_reward/mean": 0.23046875, "rewards/accuracy_reward/std": 0.42154473066329956, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 543.841796875, "completions/mean_terminated_length": 543.841796875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.1259450800716877, "epoch": 0.34346193240013656, "frac_reward_zero_std": 0.53125, "grad_norm": 0.18908613990696943, "learning_rate": 8.474776807675032e-07, "loss": 0.0129, "num_tokens": 413172487.0, "reward": 2.1865234375, "reward_std": 0.189191073179245, "rewards/accuracy_reward/mean": 0.189453125, "rewards/accuracy_reward/std": 0.3922513723373413, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1660.0, "completions/mean_length": 519.908203125, "completions/mean_terminated_length": 516.9177856445312, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.14126872271299362, "epoch": 0.34380334585182654, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1643837234254693, "learning_rate": 8.470750581891728e-07, "loss": 0.021, "num_tokens": 413512168.0, "reward": 2.14306640625, "reward_std": 0.12825830280780792, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1198.0, "completions/max_terminated_length": 1198.0, "completions/mean_length": 482.322265625, "completions/mean_terminated_length": 482.322265625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.15890821442008018, "epoch": 0.34414475930351657, "frac_reward_zero_std": 0.71875, "grad_norm": 0.15300895594694583, "learning_rate": 8.466720136471607e-07, "loss": -0.0047, "num_tokens": 413838269.0, "reward": 2.07763671875, "reward_std": 0.1159873753786087, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1861.0, "completions/mean_length": 520.873046875, "completions/mean_terminated_length": 514.8843383789062, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.1590247005224228, "epoch": 0.34448617275520654, "frac_reward_zero_std": 0.625, "grad_norm": 0.16117497065589578, "learning_rate": 8.462685477139489e-07, "loss": 0.0208, "num_tokens": 414191260.0, "reward": 2.1298828125, "reward_std": 0.16738513112068176, "rewards/accuracy_reward/mean": 0.1411290317773819, "rewards/accuracy_reward/std": 0.3485061228275299, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1607.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 448.41796875, "completions/mean_terminated_length": 448.41796875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.15567228570580482, "epoch": 0.3448275862068966, "frac_reward_zero_std": 0.59375, "grad_norm": 0.16729374925476925, "learning_rate": 8.458646609626183e-07, "loss": -0.0042, "num_tokens": 414496962.0, "reward": 2.244140625, "reward_std": 0.16418945789337158, "rewards/accuracy_reward/mean": 0.244140625, "rewards/accuracy_reward/std": 0.42999663949012756, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1616.0, "completions/mean_length": 473.55859375, "completions/mean_terminated_length": 470.47747802734375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.16344084590673447, "epoch": 0.34516899965858655, "frac_reward_zero_std": 0.8125, "grad_norm": 0.12102311058210216, "learning_rate": 8.45460353966847e-07, "loss": 0.0181, "num_tokens": 414814848.0, "reward": 2.09423828125, "reward_std": 0.0742429569363594, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1738.0, "completions/mean_length": 476.142578125, "completions/mean_terminated_length": 473.0665283203125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.1629374548792839, "epoch": 0.3455104131102765, "frac_reward_zero_std": 0.6875, "grad_norm": 0.15581845519010717, "learning_rate": 8.450556273009104e-07, "loss": 0.0199, "num_tokens": 415132121.0, "reward": 2.1015625, "reward_std": 0.11520938575267792, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2047.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 543.61328125, "completions/mean_terminated_length": 501.349365234375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.15652335062623024, "epoch": 0.34585182656196656, "frac_reward_zero_std": 0.625, "grad_norm": 0.1483784571196715, "learning_rate": 8.4465048153968e-07, "loss": 0.0041, "num_tokens": 415492851.0, "reward": 2.1591796875, "reward_std": 0.14881163835525513, "rewards/accuracy_reward/mean": 0.212890625, "rewards/accuracy_reward/std": 0.409751296043396, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17416280508041382, "rewards/tag_count_reward/mean": 0.9775390625, "rewards/tag_count_reward/std": 0.1260315477848053, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1663.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 461.3515625, "completions/mean_terminated_length": 461.3515625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.1381632536649704, "epoch": 0.34619324001365653, "frac_reward_zero_std": 0.5625, "grad_norm": 0.18379602252832505, "learning_rate": 8.442449172586224e-07, "loss": 0.014, "num_tokens": 415799095.0, "reward": 2.20068359375, "reward_std": 0.18253467977046967, "rewards/accuracy_reward/mean": 0.20703125, "rewards/accuracy_reward/std": 0.40557438135147095, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 492.62109375, "completions/mean_terminated_length": 489.5773010253906, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.14901720359921455, "epoch": 0.3465346534653465, "frac_reward_zero_std": 0.84375, "grad_norm": 0.11990567518474043, "learning_rate": 8.438389350337988e-07, "loss": 0.0078, "num_tokens": 416127717.0, "reward": 2.05126953125, "reward_std": 0.06274308264255524, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1893.0, "completions/mean_length": 515.6875, "completions/mean_terminated_length": 506.6562194824219, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.12518061324954033, "epoch": 0.34687606691703654, "frac_reward_zero_std": 0.59375, "grad_norm": 0.16523651432391365, "learning_rate": 8.434325354418639e-07, "loss": 0.0173, "num_tokens": 416464933.0, "reward": 2.14990234375, "reward_std": 0.18240104615688324, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2047.0, "completions/max_terminated_length": 1918.0, "completions/mean_length": 495.31640625, "completions/mean_terminated_length": 473.80792236328125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.17164471000432968, "epoch": 0.3472174803687265, "frac_reward_zero_std": 0.75, "grad_norm": 0.15074100748110306, "learning_rate": 8.430257190600653e-07, "loss": 0.0054, "num_tokens": 416799175.0, "reward": 2.07470703125, "reward_std": 0.11578792333602905, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17416280508041382, "rewards/tag_count_reward/mean": 0.98095703125, "rewards/tag_count_reward/std": 0.10837557911872864, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1884.0, "completions/max_terminated_length": 1884.0, "completions/mean_length": 422.224609375, "completions/mean_terminated_length": 422.224609375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.17722434550523758, "epoch": 0.34755889382041655, "frac_reward_zero_std": 0.75, "grad_norm": 0.14980129057336497, "learning_rate": 8.426184864662426e-07, "loss": -0.0111, "num_tokens": 417098858.0, "reward": 2.1083984375, "reward_std": 0.10325562208890915, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1702.0, "completions/mean_length": 499.4765625, "completions/mean_terminated_length": 496.4461669921875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.14755715802311897, "epoch": 0.3479003072721065, "frac_reward_zero_std": 0.6875, "grad_norm": 0.15144177761538474, "learning_rate": 8.422108382388268e-07, "loss": 0.0045, "num_tokens": 417425774.0, "reward": 2.06103515625, "reward_std": 0.1245550587773323, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1207.0, "completions/max_terminated_length": 1207.0, "completions/mean_length": 428.271484375, "completions/mean_terminated_length": 428.271484375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.14556795358657837, "epoch": 0.3482417207237965, "frac_reward_zero_std": 0.71875, "grad_norm": 0.16756603160043973, "learning_rate": 8.418027749568388e-07, "loss": 0.0045, "num_tokens": 417720009.0, "reward": 2.216796875, "reward_std": 0.12064876407384872, "rewards/accuracy_reward/mean": 0.216796875, "rewards/accuracy_reward/std": 0.4124660789966583, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1651.0, "completions/max_terminated_length": 1651.0, "completions/mean_length": 501.12109375, "completions/mean_terminated_length": 501.12109375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.12643762305378914, "epoch": 0.34858313417548653, "frac_reward_zero_std": 0.71875, "grad_norm": 0.12807861710263993, "learning_rate": 8.413942971998897e-07, "loss": -0.0053, "num_tokens": 418045671.0, "reward": 2.185546875, "reward_std": 0.10584832727909088, "rewards/accuracy_reward/mean": 0.185546875, "rewards/accuracy_reward/std": 0.38912075757980347, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1901.0, "completions/mean_length": 490.162109375, "completions/mean_terminated_length": 484.052978515625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.14981424435973167, "epoch": 0.3489245476271765, "frac_reward_zero_std": 0.625, "grad_norm": 0.15859564561681375, "learning_rate": 8.409854055481784e-07, "loss": 0.0184, "num_tokens": 418369402.0, "reward": 2.1259765625, "reward_std": 0.16491740942001343, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 506.55078125, "completions/mean_terminated_length": 497.46563720703125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.12585267424583435, "epoch": 0.3492659610788665, "frac_reward_zero_std": 0.59375, "grad_norm": 0.17057162615513424, "learning_rate": 8.405761005824927e-07, "loss": 0.0245, "num_tokens": 418704068.0, "reward": 2.14990234375, "reward_std": 0.15432196855545044, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1305.0, "completions/mean_length": 495.494140625, "completions/mean_terminated_length": 492.4559631347656, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.1397881768643856, "epoch": 0.3496073745305565, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1480870810140005, "learning_rate": 8.401663828842066e-07, "loss": 0.0049, "num_tokens": 419023265.0, "reward": 2.25439453125, "reward_std": 0.147362619638443, "rewards/accuracy_reward/mean": 0.2661290466785431, "rewards/accuracy_reward/std": 0.4423787593841553, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1366.0, "completions/mean_length": 519.3203125, "completions/mean_terminated_length": 513.3255004882812, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.13156871497631073, "epoch": 0.3499487879822465, "frac_reward_zero_std": 0.59375, "grad_norm": 0.15202184556842863, "learning_rate": 8.39756253035281e-07, "loss": 0.0245, "num_tokens": 419369861.0, "reward": 2.1865234375, "reward_std": 0.16759777069091797, "rewards/accuracy_reward/mean": 0.193359375, "rewards/accuracy_reward/std": 0.39531853795051575, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1617.0, "completions/max_terminated_length": 1617.0, "completions/mean_length": 460.580078125, "completions/mean_terminated_length": 460.580078125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.14918005466461182, "epoch": 0.3502902014339365, "frac_reward_zero_std": 0.78125, "grad_norm": 0.11595728864355229, "learning_rate": 8.393457116182619e-07, "loss": 0.0106, "num_tokens": 419681150.0, "reward": 2.134765625, "reward_std": 0.09915371984243393, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1813.0, "completions/max_terminated_length": 1813.0, "completions/mean_length": 513.25, "completions/mean_terminated_length": 513.25, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.13045495375990868, "epoch": 0.3506316148856265, "frac_reward_zero_std": 0.71875, "grad_norm": 0.15864302533660884, "learning_rate": 8.389347592162799e-07, "loss": 0.0187, "num_tokens": 420015662.0, "reward": 2.232421875, "reward_std": 0.10997485369443893, "rewards/accuracy_reward/mean": 0.232421875, "rewards/accuracy_reward/std": 0.42278963327407837, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1292.0, "completions/mean_length": 554.26171875, "completions/mean_terminated_length": 542.5, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.13588141836225986, "epoch": 0.3509730283373165, "frac_reward_zero_std": 0.5, "grad_norm": 0.15862615579517533, "learning_rate": 8.385233964130493e-07, "loss": 0.0266, "num_tokens": 420371620.0, "reward": 2.2197265625, "reward_std": 0.24267145991325378, "rewards/accuracy_reward/mean": 0.232421875, "rewards/accuracy_reward/std": 0.42278963327407837, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1724.0, "completions/mean_length": 563.8203125, "completions/mean_terminated_length": 560.9158325195312, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.1360701285302639, "epoch": 0.3513144417890065, "frac_reward_zero_std": 0.75, "grad_norm": 0.11943876356532045, "learning_rate": 8.381116237928677e-07, "loss": -0.0045, "num_tokens": 420733832.0, "reward": 2.10791015625, "reward_std": 0.09965138137340546, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.06134068965911865, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1746.0, "completions/mean_length": 535.0390625, "completions/mean_terminated_length": 526.121826171875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.12264568917453289, "epoch": 0.3516558552406965, "frac_reward_zero_std": 0.59375, "grad_norm": 0.16034831300323696, "learning_rate": 8.376994419406141e-07, "loss": 0.0151, "num_tokens": 421092156.0, "reward": 2.1640625, "reward_std": 0.1924576461315155, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.3875311613082886, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 551.244140625, "completions/mean_terminated_length": 502.961669921875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.14949147030711174, "epoch": 0.35199726869238646, "frac_reward_zero_std": 0.71875, "grad_norm": 0.1302843517877378, "learning_rate": 8.372868514417496e-07, "loss": 0.0024, "num_tokens": 421452569.0, "reward": 2.046875, "reward_std": 0.0917678028345108, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17416280508041382, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13062210381031036, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1567.0, "completions/max_terminated_length": 1567.0, "completions/mean_length": 531.318359375, "completions/mean_terminated_length": 531.318359375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.13752086088061333, "epoch": 0.3523386821440765, "frac_reward_zero_std": 0.78125, "grad_norm": 0.11286160184057514, "learning_rate": 8.368738528823152e-07, "loss": -0.0115, "num_tokens": 421800316.0, "reward": 2.103515625, "reward_std": 0.094291090965271, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1422.0, "completions/mean_length": 507.189453125, "completions/mean_terminated_length": 504.1741638183594, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.15396923944354057, "epoch": 0.35268009559576646, "frac_reward_zero_std": 0.59375, "grad_norm": 0.17056716924976553, "learning_rate": 8.364604468489316e-07, "loss": 0.0187, "num_tokens": 422135933.0, "reward": 2.12939453125, "reward_std": 0.16733267903327942, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 505.55078125, "completions/mean_terminated_length": 502.53228759765625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.13704610988497734, "epoch": 0.3530215090474565, "frac_reward_zero_std": 0.65625, "grad_norm": 0.14851809345200603, "learning_rate": 8.360466339287985e-07, "loss": -0.0103, "num_tokens": 422466327.0, "reward": 2.20947265625, "reward_std": 0.15030494332313538, "rewards/accuracy_reward/mean": 0.212890625, "rewards/accuracy_reward/std": 0.409751296043396, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1342.0, "completions/mean_length": 558.271484375, "completions/mean_terminated_length": 546.5413208007812, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.12477041780948639, "epoch": 0.35336292249914647, "frac_reward_zero_std": 0.6875, "grad_norm": 0.14737167416357072, "learning_rate": 8.356324147096931e-07, "loss": 0.0544, "num_tokens": 422830690.0, "reward": 2.0693359375, "reward_std": 0.12564881145954132, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1683.0, "completions/mean_length": 502.078125, "completions/mean_terminated_length": 499.0528259277344, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.1581803821027279, "epoch": 0.35370433595083645, "frac_reward_zero_std": 0.625, "grad_norm": 0.18562751711853442, "learning_rate": 8.352177897799701e-07, "loss": 0.0097, "num_tokens": 423169306.0, "reward": 2.08642578125, "reward_std": 0.15626594424247742, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 577.201171875, "completions/mean_terminated_length": 574.3228759765625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.11277149803936481, "epoch": 0.3540457494025265, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1628618364615968, "learning_rate": 8.348027597285601e-07, "loss": -0.0239, "num_tokens": 423539505.0, "reward": 2.08056640625, "reward_std": 0.14019882678985596, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 655.626953125, "completions/mean_terminated_length": 650.1666870117188, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.09981233440339565, "epoch": 0.35438716285421645, "frac_reward_zero_std": 0.625, "grad_norm": 0.1540602396776042, "learning_rate": 8.343873251449699e-07, "loss": 0.0087, "num_tokens": 423946306.0, "reward": 2.1044921875, "reward_std": 0.1444317102432251, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1612.0, "completions/max_terminated_length": 1612.0, "completions/mean_length": 516.048828125, "completions/mean_terminated_length": 516.048828125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.13694704324007034, "epoch": 0.35472857630590643, "frac_reward_zero_std": 0.65625, "grad_norm": 0.14525109333051878, "learning_rate": 8.3397148661928e-07, "loss": -0.027, "num_tokens": 424283163.0, "reward": 2.15673828125, "reward_std": 0.1511204093694687, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1491.0, "completions/max_terminated_length": 1491.0, "completions/mean_length": 548.232421875, "completions/mean_terminated_length": 548.232421875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.1350406352430582, "epoch": 0.35506998975759646, "frac_reward_zero_std": 0.53125, "grad_norm": 0.16427298019948275, "learning_rate": 8.33555244742145e-07, "loss": -0.0032, "num_tokens": 424636498.0, "reward": 2.19921875, "reward_std": 0.19731301069259644, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.39980348944664, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1942.0, "completions/max_terminated_length": 1942.0, "completions/mean_length": 464.08203125, "completions/mean_terminated_length": 464.08203125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.15277955308556557, "epoch": 0.35541140320928644, "frac_reward_zero_std": 0.71875, "grad_norm": 0.14855608559500771, "learning_rate": 8.331386001047927e-07, "loss": -0.0024, "num_tokens": 424956572.0, "reward": 2.1181640625, "reward_std": 0.09014645218849182, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1655.0, "completions/max_terminated_length": 1655.0, "completions/mean_length": 502.408203125, "completions/mean_terminated_length": 502.408203125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.1419469155371189, "epoch": 0.35575281666097647, "frac_reward_zero_std": 0.84375, "grad_norm": 0.09712439441787048, "learning_rate": 8.327215532990221e-07, "loss": -0.0085, "num_tokens": 425288093.0, "reward": 2.2109375, "reward_std": 0.06609338521957397, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4083731174468994, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1441.0, "completions/max_terminated_length": 1441.0, "completions/mean_length": 589.611328125, "completions/mean_terminated_length": 589.611328125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.13144463673233986, "epoch": 0.35609423011266644, "frac_reward_zero_std": 0.71875, "grad_norm": 0.12934047424296413, "learning_rate": 8.323041049172048e-07, "loss": -0.0041, "num_tokens": 425676694.0, "reward": 2.11328125, "reward_std": 0.09413032233715057, "rewards/accuracy_reward/mean": 0.11693548411130905, "rewards/accuracy_reward/std": 0.3216678202152252, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 569.9453125, "completions/mean_terminated_length": 567.0528564453125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.1293945498764515, "epoch": 0.3564356435643564, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1504697125496585, "learning_rate": 8.318862555522816e-07, "loss": 0.0036, "num_tokens": 426054218.0, "reward": 2.19873046875, "reward_std": 0.13688495755195618, "rewards/accuracy_reward/mean": 0.21169355511665344, "rewards/accuracy_reward/std": 0.40892118215560913, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.039800092577934265, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1345.0, "completions/mean_length": 529.34765625, "completions/mean_terminated_length": 520.3968505859375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.14028280600905418, "epoch": 0.35677705701604645, "frac_reward_zero_std": 0.625, "grad_norm": 0.1658119402421015, "learning_rate": 8.314680057977636e-07, "loss": 0.0294, "num_tokens": 426408988.0, "reward": 2.13232421875, "reward_std": 0.14508582651615143, "rewards/accuracy_reward/mean": 0.14717741310596466, "rewards/accuracy_reward/std": 0.354640394449234, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 544.193359375, "completions/mean_terminated_length": 535.330078125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.14551009982824326, "epoch": 0.3571184704677364, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13858480151041022, "learning_rate": 8.3104935624773e-07, "loss": 0.0496, "num_tokens": 426765567.0, "reward": 2.11865234375, "reward_std": 0.1541864573955536, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 526.00390625, "completions/mean_terminated_length": 520.0353393554688, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.1552623100578785, "epoch": 0.3574598839194264, "frac_reward_zero_std": 0.46875, "grad_norm": 0.2073383294279051, "learning_rate": 8.306303074968283e-07, "loss": 0.0062, "num_tokens": 427116497.0, "reward": 2.2001953125, "reward_std": 0.20888106524944305, "rewards/accuracy_reward/mean": 0.20703125, "rewards/accuracy_reward/std": 0.40557438135147095, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1551.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 477.3203125, "completions/mean_terminated_length": 477.3203125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.17131642624735832, "epoch": 0.35780129737111643, "frac_reward_zero_std": 0.46875, "grad_norm": 0.26493077179684354, "learning_rate": 8.302108601402731e-07, "loss": 0.0188, "num_tokens": 427445269.0, "reward": 2.18505859375, "reward_std": 0.20989397168159485, "rewards/accuracy_reward/mean": 0.197265625, "rewards/accuracy_reward/std": 0.3983237147331238, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04538619518280029, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 575.83984375, "completions/mean_terminated_length": 528.3810424804688, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.151562612503767, "epoch": 0.3581427108228064, "frac_reward_zero_std": 0.53125, "grad_norm": 0.15877825150563776, "learning_rate": 8.297910147738446e-07, "loss": 0.0151, "num_tokens": 427821091.0, "reward": 2.2490234375, "reward_std": 0.1554987132549286, "rewards/accuracy_reward/mean": 0.306640625, "rewards/accuracy_reward/std": 0.4615498185157776, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.9755859375, "rewards/tag_count_reward/std": 0.13230475783348083, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 565.697265625, "completions/mean_terminated_length": 562.7964477539062, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.15501310676336288, "epoch": 0.35848412427449644, "frac_reward_zero_std": 0.78125, "grad_norm": 0.14090687173775931, "learning_rate": 8.293707719938891e-07, "loss": 0.0068, "num_tokens": 428192504.0, "reward": 2.04296875, "reward_std": 0.0879368782043457, "rewards/accuracy_reward/mean": 0.05040322616696358, "rewards/accuracy_reward/std": 0.21899643540382385, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 542.52734375, "completions/mean_terminated_length": 536.62353515625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.13652550801634789, "epoch": 0.3588255377261864, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1494644478184965, "learning_rate": 8.289501323973167e-07, "loss": 0.008, "num_tokens": 428544806.0, "reward": 2.1943359375, "reward_std": 0.16284452378749847, "rewards/accuracy_reward/mean": 0.201171875, "rewards/accuracy_reward/std": 0.4012683033943176, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1758.0, "completions/mean_length": 549.09765625, "completions/mean_terminated_length": 543.2196655273438, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.1453014686703682, "epoch": 0.3591669511778764, "frac_reward_zero_std": 0.71875, "grad_norm": 0.13665668897660266, "learning_rate": 8.285290965816016e-07, "loss": 0.0222, "num_tokens": 428899160.0, "reward": 2.0908203125, "reward_std": 0.12136103957891464, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1631.0, "completions/mean_length": 586.228515625, "completions/mean_terminated_length": 580.49609375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.1463911198079586, "epoch": 0.3595083646295664, "frac_reward_zero_std": 0.6875, "grad_norm": 0.14149155713251918, "learning_rate": 8.281076651447806e-07, "loss": 0.0155, "num_tokens": 429279949.0, "reward": 2.11767578125, "reward_std": 0.13050079345703125, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1784.0, "completions/mean_length": 637.90625, "completions/mean_terminated_length": 606.9640502929688, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.15621589496731758, "epoch": 0.3598497780812564, "frac_reward_zero_std": 0.59375, "grad_norm": 0.19300815417981868, "learning_rate": 8.276858386854524e-07, "loss": 0.0376, "num_tokens": 429692461.0, "reward": 2.0673828125, "reward_std": 0.1523856818675995, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.1843547374010086, "rewards/tag_count_reward/mean": 0.9794921875, "rewards/tag_count_reward/std": 0.10979136824607849, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1850.0, "completions/mean_length": 553.1171875, "completions/mean_terminated_length": 547.2549438476562, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.15196191519498825, "epoch": 0.3601911915329464, "frac_reward_zero_std": 0.6875, "grad_norm": 0.13075482965358234, "learning_rate": 8.272636178027768e-07, "loss": 0.0327, "num_tokens": 430050121.0, "reward": 2.1357421875, "reward_std": 0.12693417072296143, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1680.0, "completions/max_terminated_length": 1680.0, "completions/mean_length": 590.150390625, "completions/mean_terminated_length": 590.150390625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.14980318769812584, "epoch": 0.3605326049846364, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14821066123623122, "learning_rate": 8.268410030964739e-07, "loss": -0.0046, "num_tokens": 430427542.0, "reward": 2.12109375, "reward_std": 0.14194431900978088, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1552.0, "completions/mean_length": 543.435546875, "completions/mean_terminated_length": 537.5353393554688, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.16955586895346642, "epoch": 0.3608740184363264, "frac_reward_zero_std": 0.5625, "grad_norm": 0.17430047907492782, "learning_rate": 8.264179951668234e-07, "loss": 0.0344, "num_tokens": 430776325.0, "reward": 2.1787109375, "reward_std": 0.16472187638282776, "rewards/accuracy_reward/mean": 0.185546875, "rewards/accuracy_reward/std": 0.38912075757980347, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 611.43359375, "completions/mean_terminated_length": 608.622314453125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.1348312459886074, "epoch": 0.3612154318880164, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12772813477433292, "learning_rate": 8.259945946146631e-07, "loss": 0.0055, "num_tokens": 431169635.0, "reward": 2.13916015625, "reward_std": 0.15258750319480896, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1423.0, "completions/max_terminated_length": 1423.0, "completions/mean_length": 590.849609375, "completions/mean_terminated_length": 590.849609375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.13831545040011406, "epoch": 0.3615568453397064, "frac_reward_zero_std": 0.5, "grad_norm": 0.15150089183708326, "learning_rate": 8.255708020413886e-07, "loss": 0.004, "num_tokens": 431538438.0, "reward": 2.248046875, "reward_std": 0.19299906492233276, "rewards/accuracy_reward/mean": 0.248046875, "rewards/accuracy_reward/std": 0.4323015511035919, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1921.0, "completions/mean_length": 568.583984375, "completions/mean_terminated_length": 565.6888427734375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.14974427223205566, "epoch": 0.36189825879139637, "frac_reward_zero_std": 0.5, "grad_norm": 0.19151802766114284, "learning_rate": 8.251466180489526e-07, "loss": 0.0222, "num_tokens": 431904801.0, "reward": 2.17626953125, "reward_std": 0.19293415546417236, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.38430243730545044, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 615.388671875, "completions/mean_terminated_length": 612.5851440429688, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.1321590654551983, "epoch": 0.3622396722430864, "frac_reward_zero_std": 0.625, "grad_norm": 0.12921361029047104, "learning_rate": 8.247220432398635e-07, "loss": 0.0128, "num_tokens": 432310744.0, "reward": 2.21533203125, "reward_std": 0.15940770506858826, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41380295157432556, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1691.0, "completions/mean_length": 570.271484375, "completions/mean_terminated_length": 558.6358032226562, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.13561303541064262, "epoch": 0.3625810856947764, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12493452273207935, "learning_rate": 8.242970782171847e-07, "loss": 0.026, "num_tokens": 432676515.0, "reward": 2.15283203125, "reward_std": 0.1414296180009842, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.06134068965911865, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 503.408203125, "completions/mean_terminated_length": 500.3855285644531, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.13994567468762398, "epoch": 0.36292249914646635, "frac_reward_zero_std": 0.53125, "grad_norm": 0.18263361921750135, "learning_rate": 8.238717235845342e-07, "loss": 0.0188, "num_tokens": 433002596.0, "reward": 2.26806640625, "reward_std": 0.20442159473896027, "rewards/accuracy_reward/mean": 0.271484375, "rewards/accuracy_reward/std": 0.44516023993492126, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 610.19921875, "completions/mean_terminated_length": 598.8779296875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.13554790616035461, "epoch": 0.3632639125981564, "frac_reward_zero_std": 0.625, "grad_norm": 0.15988321139456835, "learning_rate": 8.234459799460834e-07, "loss": 0.0131, "num_tokens": 433394682.0, "reward": 2.146484375, "reward_std": 0.16132193803787231, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.0806792601943016, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 595.017578125, "completions/mean_terminated_length": 592.1741943359375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.15012116730213165, "epoch": 0.36360532604984636, "frac_reward_zero_std": 0.5, "grad_norm": 0.1634908671744332, "learning_rate": 8.230198479065557e-07, "loss": -0.0013, "num_tokens": 433773459.0, "reward": 2.18505859375, "reward_std": 0.2044278085231781, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.3937928080558777, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.039800092577934265, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1675.0, "completions/mean_length": 549.1171875, "completions/mean_terminated_length": 543.2392578125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.1499665416777134, "epoch": 0.3639467395015364, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1773791636625689, "learning_rate": 8.22593328071227e-07, "loss": 0.0172, "num_tokens": 434126287.0, "reward": 2.2119140625, "reward_std": 0.17005978524684906, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41380295157432556, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 585.697265625, "completions/mean_terminated_length": 582.8356323242188, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.15224897861480713, "epoch": 0.36428815295322636, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13075515514196928, "learning_rate": 8.221664210459234e-07, "loss": 0.0083, "num_tokens": 434506372.0, "reward": 2.13134765625, "reward_std": 0.1321447342634201, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1732.0, "completions/max_terminated_length": 1732.0, "completions/mean_length": 580.830078125, "completions/mean_terminated_length": 580.830078125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.15229212120175362, "epoch": 0.36462956640491634, "frac_reward_zero_std": 0.5625, "grad_norm": 0.16019177051010333, "learning_rate": 8.217391274370209e-07, "loss": -0.0286, "num_tokens": 434880653.0, "reward": 2.244140625, "reward_std": 0.1922931671142578, "rewards/accuracy_reward/mean": 0.2520161271095276, "rewards/accuracy_reward/std": 0.4346088469028473, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1595.0, "completions/mean_length": 653.083984375, "completions/mean_terminated_length": 642.1004028320312, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.1376323439180851, "epoch": 0.36497097985660637, "frac_reward_zero_std": 0.625, "grad_norm": 0.14381830436773488, "learning_rate": 8.213114478514453e-07, "loss": 0.046, "num_tokens": 435296216.0, "reward": 2.083984375, "reward_std": 0.16855508089065552, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 537.583984375, "completions/mean_terminated_length": 531.6608276367188, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.16238385811448097, "epoch": 0.36531239330829635, "frac_reward_zero_std": 0.5, "grad_norm": 0.16845053440227528, "learning_rate": 8.208833828966698e-07, "loss": 0.0297, "num_tokens": 435649955.0, "reward": 2.1357421875, "reward_std": 0.18856075406074524, "rewards/accuracy_reward/mean": 0.14717741310596466, "rewards/accuracy_reward/std": 0.3546403646469116, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 614.095703125, "completions/mean_terminated_length": 605.6444091796875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.15553845092654228, "epoch": 0.3656538067599863, "frac_reward_zero_std": 0.5, "grad_norm": 0.14143326415048607, "learning_rate": 8.204549331807157e-07, "loss": 0.0269, "num_tokens": 436048884.0, "reward": 2.16064453125, "reward_std": 0.19948136806488037, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.06134068965911865, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 606.59375, "completions/mean_terminated_length": 603.7730102539062, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.14029798284173012, "epoch": 0.36599522021167635, "frac_reward_zero_std": 0.71875, "grad_norm": 0.1244793354916626, "learning_rate": 8.200260993121503e-07, "loss": 0.0142, "num_tokens": 436437188.0, "reward": 2.14892578125, "reward_std": 0.10433532297611237, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1426.0, "completions/mean_length": 556.08984375, "completions/mean_terminated_length": 553.1702270507812, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.14289464801549911, "epoch": 0.36633663366336633, "frac_reward_zero_std": 0.625, "grad_norm": 0.1591834291389743, "learning_rate": 8.195968819000867e-07, "loss": 0.0064, "num_tokens": 436802930.0, "reward": 2.17041015625, "reward_std": 0.16933627426624298, "rewards/accuracy_reward/mean": 0.22782258689403534, "rewards/accuracy_reward/std": 0.4198509752750397, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.98291015625, "rewards/tag_count_reward/std": 0.09292974323034286, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 589.970703125, "completions/mean_terminated_length": 584.2529907226562, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.1270919293165207, "epoch": 0.36667804711505636, "frac_reward_zero_std": 0.5625, "grad_norm": 0.19580691141890633, "learning_rate": 8.191672815541827e-07, "loss": 0.0154, "num_tokens": 437176163.0, "reward": 2.2275390625, "reward_std": 0.17996101081371307, "rewards/accuracy_reward/mean": 0.24193547666072845, "rewards/accuracy_reward/std": 0.42868778109550476, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1523.0, "completions/max_terminated_length": 1523.0, "completions/mean_length": 552.720703125, "completions/mean_terminated_length": 552.720703125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.1550266556441784, "epoch": 0.36701946056674634, "frac_reward_zero_std": 0.59375, "grad_norm": 0.15769672241419772, "learning_rate": 8.187372988846406e-07, "loss": -0.0057, "num_tokens": 437536116.0, "reward": 2.2109375, "reward_std": 0.16349101066589355, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4083731174468994, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 562.765625, "completions/mean_terminated_length": 511.7899169921875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.16558696702122688, "epoch": 0.3673608740184363, "frac_reward_zero_std": 0.6875, "grad_norm": 0.15258377499409137, "learning_rate": 8.183069345022047e-07, "loss": 0.016, "num_tokens": 437902620.0, "reward": 2.1669921875, "reward_std": 0.14146780967712402, "rewards/accuracy_reward/mean": 0.228515625, "rewards/accuracy_reward/std": 0.4202871024608612, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.1843547374010086, "rewards/tag_count_reward/mean": 0.9736328125, "rewards/tag_count_reward/std": 0.13826605677604675, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1810.0, "completions/mean_length": 541.859375, "completions/mean_terminated_length": 532.9823608398438, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.14634942263364792, "epoch": 0.36770228747012634, "frac_reward_zero_std": 0.59375, "grad_norm": 0.19231673828571816, "learning_rate": 8.178761890181624e-07, "loss": 0.0123, "num_tokens": 438247556.0, "reward": 2.14306640625, "reward_std": 0.17280617356300354, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.06134068965911865, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1928.0, "completions/mean_length": 620.5625, "completions/mean_terminated_length": 614.9647216796875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.12537699565291405, "epoch": 0.3680437009218163, "frac_reward_zero_std": 0.5625, "grad_norm": 0.18553201859042756, "learning_rate": 8.174450630443423e-07, "loss": 0.0259, "num_tokens": 438632612.0, "reward": 2.13525390625, "reward_std": 0.2002282738685608, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1440.0, "completions/mean_length": 510.447265625, "completions/mean_terminated_length": 498.3464660644531, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.14879430830478668, "epoch": 0.3683851143735063, "frac_reward_zero_std": 0.625, "grad_norm": 0.21855012117481942, "learning_rate": 8.170135571931125e-07, "loss": 0.0308, "num_tokens": 438977529.0, "reward": 2.2119140625, "reward_std": 0.13563485443592072, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.44209739565849304, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.9794921875, "rewards/tag_count_reward/std": 0.11308404058218002, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 580.689453125, "completions/mean_terminated_length": 577.8179931640625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.12510309368371964, "epoch": 0.3687265278251963, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14949237078116, "learning_rate": 8.165816720773819e-07, "loss": 0.0151, "num_tokens": 439356426.0, "reward": 2.17626953125, "reward_std": 0.16965290904045105, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.38430243730545044, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1513.0, "completions/max_terminated_length": 1513.0, "completions/mean_length": 546.806640625, "completions/mean_terminated_length": 546.806640625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.15286070853471756, "epoch": 0.3690679412768863, "frac_reward_zero_std": 0.65625, "grad_norm": 0.16141641877985768, "learning_rate": 8.161494083105976e-07, "loss": -0.0039, "num_tokens": 439713879.0, "reward": 2.087890625, "reward_std": 0.13507547974586487, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1756.0, "completions/mean_length": 624.39453125, "completions/mean_terminated_length": 613.18505859375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.11394527554512024, "epoch": 0.36940935472857633, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1890859894602558, "learning_rate": 8.157167665067446e-07, "loss": 0.0355, "num_tokens": 440104353.0, "reward": 2.10595703125, "reward_std": 0.1734444499015808, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1602.0, "completions/mean_length": 599.685546875, "completions/mean_terminated_length": 596.8512573242188, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.12239312566816807, "epoch": 0.3697507681802663, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11864326956137156, "learning_rate": 8.152837472803445e-07, "loss": -0.0007, "num_tokens": 440490048.0, "reward": 2.10986328125, "reward_std": 0.12183638662099838, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1649.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 475.078125, "completions/mean_terminated_length": 475.078125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.13568344339728355, "epoch": 0.3700921816319563, "frac_reward_zero_std": 0.59375, "grad_norm": 0.19680449097709693, "learning_rate": 8.148503512464555e-07, "loss": 0.0016, "num_tokens": 440804728.0, "reward": 2.234375, "reward_std": 0.1654340773820877, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42402184009552, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1823.0, "completions/mean_length": 556.91796875, "completions/mean_terminated_length": 548.1296997070312, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.11587426997721195, "epoch": 0.3704335950836463, "frac_reward_zero_std": 0.71875, "grad_norm": 0.14251522476699174, "learning_rate": 8.144165790206708e-07, "loss": 0.0427, "num_tokens": 441167934.0, "reward": 2.08447265625, "reward_std": 0.12589994072914124, "rewards/accuracy_reward/mean": 0.10000000149011612, "rewards/accuracy_reward/std": 0.30031299591064453, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1875.0, "completions/max_terminated_length": 1875.0, "completions/mean_length": 558.880859375, "completions/mean_terminated_length": 558.880859375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.13356975093483925, "epoch": 0.3707750085353363, "frac_reward_zero_std": 0.71875, "grad_norm": 0.17816189353929476, "learning_rate": 8.139824312191178e-07, "loss": -0.0125, "num_tokens": 441532241.0, "reward": 2.115234375, "reward_std": 0.10504382103681564, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 587.4140625, "completions/mean_terminated_length": 584.5557861328125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.10849324986338615, "epoch": 0.37111642198702627, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1940835657737219, "learning_rate": 8.135479084584576e-07, "loss": -0.0031, "num_tokens": 441913141.0, "reward": 2.14697265625, "reward_std": 0.15975967049598694, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 575.482421875, "completions/mean_terminated_length": 572.6007690429688, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.11524662002921104, "epoch": 0.3714578354387163, "frac_reward_zero_std": 0.53125, "grad_norm": 0.2328806321191046, "learning_rate": 8.131130113558837e-07, "loss": -0.0005, "num_tokens": 442282956.0, "reward": 2.10986328125, "reward_std": 0.18550750613212585, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 2047.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 573.345703125, "completions/mean_terminated_length": 528.8692016601562, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.13218218833208084, "epoch": 0.3717992488904063, "frac_reward_zero_std": 0.78125, "grad_norm": 0.139106863522463, "learning_rate": 8.126777405291217e-07, "loss": -0.0014, "num_tokens": 442652717.0, "reward": 2.05322265625, "reward_std": 0.07418248802423477, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17416280508041382, "rewards/tag_count_reward/mean": 0.97705078125, "rewards/tag_count_reward/std": 0.12834827601909637, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1517.0, "completions/mean_length": 581.818359375, "completions/mean_terminated_length": 576.0686645507812, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.11141339503228664, "epoch": 0.37214066234209625, "frac_reward_zero_std": 0.625, "grad_norm": 0.2115746509279391, "learning_rate": 8.122420965964274e-07, "loss": 0.0229, "num_tokens": 443024912.0, "reward": 2.18505859375, "reward_std": 0.16177427768707275, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.4027182459831238, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.05921651050448418, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1617.0, "completions/mean_length": 545.486328125, "completions/mean_terminated_length": 539.5941772460938, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.11833148077130318, "epoch": 0.3724820757937863, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2412898312810431, "learning_rate": 8.11806080176587e-07, "loss": 0.0173, "num_tokens": 443385625.0, "reward": 2.1259765625, "reward_std": 0.12898093461990356, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1697.0, "completions/max_terminated_length": 1697.0, "completions/mean_length": 505.703125, "completions/mean_terminated_length": 505.703125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.14029542356729507, "epoch": 0.37282348924547626, "frac_reward_zero_std": 0.65625, "grad_norm": 0.27677317147030944, "learning_rate": 8.113696918889159e-07, "loss": -0.0094, "num_tokens": 443728401.0, "reward": 2.048828125, "reward_std": 0.11336804181337357, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.962890625, "rewards/format_reward/std": 0.18921469151973724, "rewards/tag_count_reward/mean": 0.982421875, "rewards/tag_count_reward/std": 0.08948424458503723, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1752.0, "completions/mean_length": 556.279296875, "completions/mean_terminated_length": 550.429443359375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.10521019995212555, "epoch": 0.3731649026971663, "frac_reward_zero_std": 0.5, "grad_norm": 0.3584354492388897, "learning_rate": 8.109329323532572e-07, "loss": 0.0247, "num_tokens": 444086640.0, "reward": 2.06494140625, "reward_std": 0.17022866010665894, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.052765581756830215, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1831.0, "completions/mean_length": 575.970703125, "completions/mean_terminated_length": 573.0900268554688, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.1033447552472353, "epoch": 0.37350631614885627, "frac_reward_zero_std": 0.65625, "grad_norm": 0.22256976645150384, "learning_rate": 8.104958021899817e-07, "loss": 0.0152, "num_tokens": 444459025.0, "reward": 2.15478515625, "reward_std": 0.12970218062400818, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1290.0, "completions/max_terminated_length": 1290.0, "completions/mean_length": 583.19140625, "completions/mean_terminated_length": 583.19140625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.1103112380951643, "epoch": 0.37384772960054624, "frac_reward_zero_std": 0.65625, "grad_norm": 0.156519281932513, "learning_rate": 8.100583020199867e-07, "loss": -0.009, "num_tokens": 444835539.0, "reward": 2.14453125, "reward_std": 0.1395888328552246, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1904.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 524.09765625, "completions/mean_terminated_length": 524.09765625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.14254438504576683, "epoch": 0.3741891430522363, "frac_reward_zero_std": 0.6875, "grad_norm": 0.18928019552742512, "learning_rate": 8.096204324646946e-07, "loss": -0.011, "num_tokens": 445181669.0, "reward": 2.107421875, "reward_std": 0.11258251965045929, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1837.0, "completions/max_terminated_length": 1837.0, "completions/mean_length": 536.958984375, "completions/mean_terminated_length": 536.958984375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.10381135530769825, "epoch": 0.37453055650392625, "frac_reward_zero_std": 0.75, "grad_norm": 0.16837532525759086, "learning_rate": 8.091821941460532e-07, "loss": -0.0124, "num_tokens": 445524064.0, "reward": 2.212890625, "reward_std": 0.08918476104736328, "rewards/accuracy_reward/mean": 0.212890625, "rewards/accuracy_reward/std": 0.409751296043396, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1947.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 581.294921875, "completions/mean_terminated_length": 581.294921875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.11712227575480938, "epoch": 0.3748719699556162, "frac_reward_zero_std": 0.625, "grad_norm": 0.15489057480226082, "learning_rate": 8.08743587686533e-07, "loss": 0.0018, "num_tokens": 445898535.0, "reward": 2.16064453125, "reward_std": 0.12920989096164703, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 573.416015625, "completions/mean_terminated_length": 564.7249755859375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.12684279680252075, "epoch": 0.37521338340730626, "frac_reward_zero_std": 0.625, "grad_norm": 0.172150616484023, "learning_rate": 8.083046137091285e-07, "loss": 0.0264, "num_tokens": 446273980.0, "reward": 2.14794921875, "reward_std": 0.1633254885673523, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1480.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 567.740234375, "completions/mean_terminated_length": 567.740234375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.11634081229567528, "epoch": 0.37555479685899623, "frac_reward_zero_std": 0.71875, "grad_norm": 0.15468974155138498, "learning_rate": 8.078652728373558e-07, "loss": 0.0011, "num_tokens": 446641975.0, "reward": 2.08740234375, "reward_std": 0.09564332664012909, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1767.0, "completions/mean_length": 553.46484375, "completions/mean_terminated_length": 547.6039428710938, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.12046785466372967, "epoch": 0.37589621031068626, "frac_reward_zero_std": 0.65625, "grad_norm": 0.23152114701287985, "learning_rate": 8.07425565695252e-07, "loss": 0.0183, "num_tokens": 446999909.0, "reward": 2.2353515625, "reward_std": 0.12571361660957336, "rewards/accuracy_reward/mean": 0.25833332538604736, "rewards/accuracy_reward/std": 0.43817487359046936, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1583.0, "completions/mean_length": 611.41796875, "completions/mean_terminated_length": 605.7843627929688, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.11960061453282833, "epoch": 0.37623762376237624, "frac_reward_zero_std": 0.65625, "grad_norm": 0.14192727553054257, "learning_rate": 8.069854929073746e-07, "loss": -0.0023, "num_tokens": 447393051.0, "reward": 2.1142578125, "reward_std": 0.1496717929840088, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1964.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 488.734375, "completions/mean_terminated_length": 488.734375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.14600949734449387, "epoch": 0.3765790372140662, "frac_reward_zero_std": 0.75, "grad_norm": 0.16471148121417234, "learning_rate": 8.065450550988003e-07, "loss": 0.0024, "num_tokens": 447716051.0, "reward": 2.140625, "reward_std": 0.10580214112997055, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1321.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 534.6640625, "completions/mean_terminated_length": 534.6640625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.12226795963943005, "epoch": 0.37692045066575625, "frac_reward_zero_std": 0.46875, "grad_norm": 0.23671729811562067, "learning_rate": 8.061042528951246e-07, "loss": -0.0038, "num_tokens": 448069383.0, "reward": 2.12841796875, "reward_std": 0.2084287405014038, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1432.0, "completions/mean_length": 531.548828125, "completions/mean_terminated_length": 528.5812377929688, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.12467562593519688, "epoch": 0.3772618641174462, "frac_reward_zero_std": 0.625, "grad_norm": 0.18202036496407661, "learning_rate": 8.056630869224602e-07, "loss": 0.0078, "num_tokens": 448413056.0, "reward": 2.16845703125, "reward_std": 0.16670720279216766, "rewards/accuracy_reward/mean": 0.17741934955120087, "rewards/accuracy_reward/std": 0.38240891695022583, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1553.0, "completions/max_terminated_length": 1553.0, "completions/mean_length": 516.5078125, "completions/mean_terminated_length": 516.5078125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.1230606809258461, "epoch": 0.3776032775691362, "frac_reward_zero_std": 0.6875, "grad_norm": 0.17309749423478007, "learning_rate": 8.052215578074369e-07, "loss": 0.0094, "num_tokens": 448748340.0, "reward": 2.15966796875, "reward_std": 0.1316574066877365, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1869.0, "completions/mean_length": 661.724609375, "completions/mean_terminated_length": 614.1475219726562, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.11550881341099739, "epoch": 0.37794469102082623, "frac_reward_zero_std": 0.6875, "grad_norm": 0.14727938023923776, "learning_rate": 8.047796661771999e-07, "loss": 0.0217, "num_tokens": 449175319.0, "reward": 2.06640625, "reward_std": 0.11843889951705933, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.1843547374010086, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.1258532553911209, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1614.0, "completions/mean_length": 544.02734375, "completions/mean_terminated_length": 541.0841674804688, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.12373758479952812, "epoch": 0.3782861044725162, "frac_reward_zero_std": 0.625, "grad_norm": 0.18413119010595347, "learning_rate": 8.043374126594095e-07, "loss": 0.0135, "num_tokens": 449529013.0, "reward": 2.17236328125, "reward_std": 0.13646458089351654, "rewards/accuracy_reward/mean": 0.17578125, "rewards/accuracy_reward/std": 0.3810062110424042, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1456.0, "completions/max_terminated_length": 1456.0, "completions/mean_length": 480.375, "completions/mean_terminated_length": 480.375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.13114358857274055, "epoch": 0.37862751792420624, "frac_reward_zero_std": 0.5625, "grad_norm": 0.24835554348531766, "learning_rate": 8.038947978822401e-07, "loss": -0.0014, "num_tokens": 449850869.0, "reward": 2.2734375, "reward_std": 0.18558597564697266, "rewards/accuracy_reward/mean": 0.2734375, "rewards/accuracy_reward/std": 0.4461594223976135, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2044.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 534.974609375, "completions/mean_terminated_length": 534.974609375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.120829863473773, "epoch": 0.3789689313758962, "frac_reward_zero_std": 0.65625, "grad_norm": 0.15915208006437054, "learning_rate": 8.034518224743791e-07, "loss": 0.0083, "num_tokens": 450204888.0, "reward": 2.193359375, "reward_std": 0.15019041299819946, "rewards/accuracy_reward/mean": 0.193359375, "rewards/accuracy_reward/std": 0.39531853795051575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1642.0, "completions/mean_length": 550.751953125, "completions/mean_terminated_length": 547.8218994140625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.12374963611364365, "epoch": 0.3793103448275862, "frac_reward_zero_std": 0.40625, "grad_norm": 0.23437185037680844, "learning_rate": 8.030084870650261e-07, "loss": 0.0084, "num_tokens": 450564665.0, "reward": 2.18896484375, "reward_std": 0.210462749004364, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.3937928080558777, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1394.0, "completions/max_terminated_length": 1394.0, "completions/mean_length": 485.662109375, "completions/mean_terminated_length": 485.662109375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.1233822200447321, "epoch": 0.3796517582792762, "frac_reward_zero_std": 0.65625, "grad_norm": 0.19464420280481812, "learning_rate": 8.025647922838923e-07, "loss": -0.0035, "num_tokens": 450889596.0, "reward": 2.126953125, "reward_std": 0.1243416890501976, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1373.0, "completions/max_terminated_length": 1373.0, "completions/mean_length": 515.470703125, "completions/mean_terminated_length": 515.470703125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.11967535316944122, "epoch": 0.3799931717309662, "frac_reward_zero_std": 0.53125, "grad_norm": 0.22302849723963009, "learning_rate": 8.021207387611991e-07, "loss": -0.0232, "num_tokens": 451230509.0, "reward": 2.22265625, "reward_std": 0.1836431324481964, "rewards/accuracy_reward/mean": 0.22265625, "rewards/accuracy_reward/std": 0.41643625497817993, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 559.33984375, "completions/mean_terminated_length": 544.6588134765625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.10350626520812511, "epoch": 0.38033458518265617, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2172680780611583, "learning_rate": 8.016763271276776e-07, "loss": 0.0571, "num_tokens": 451584139.0, "reward": 2.166015625, "reward_std": 0.1829216480255127, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.3875311613082886, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.07459938526153564, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1279.0, "completions/max_terminated_length": 1279.0, "completions/mean_length": 483.9375, "completions/mean_terminated_length": 483.9375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.12512456811964512, "epoch": 0.3806759986343462, "frac_reward_zero_std": 0.71875, "grad_norm": 0.16810956921467832, "learning_rate": 8.012315580145675e-07, "loss": -0.0028, "num_tokens": 451908539.0, "reward": 2.154296875, "reward_std": 0.1287541538476944, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1890.0, "completions/max_terminated_length": 1890.0, "completions/mean_length": 487.4921875, "completions/mean_terminated_length": 487.4921875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.12347432971000671, "epoch": 0.3810174120860362, "frac_reward_zero_std": 0.75, "grad_norm": 0.16239815686230394, "learning_rate": 8.00786432053616e-07, "loss": 0.0166, "num_tokens": 452232551.0, "reward": 2.0791015625, "reward_std": 0.08356225490570068, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 525.23046875, "completions/mean_terminated_length": 522.25048828125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.12271426059305668, "epoch": 0.3813588255377262, "frac_reward_zero_std": 0.71875, "grad_norm": 0.1896359633806866, "learning_rate": 8.003409498770777e-07, "loss": 0.0161, "num_tokens": 452589069.0, "reward": 2.06494140625, "reward_std": 0.12056058645248413, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.1843547374010086, "rewards/tag_count_reward/mean": 0.98095703125, "rewards/tag_count_reward/std": 0.10137828439474106, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1490.0, "completions/mean_length": 466.69921875, "completions/mean_terminated_length": 463.6047058105469, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.12924146465957165, "epoch": 0.3817002389894162, "frac_reward_zero_std": 0.59375, "grad_norm": 0.25890174601923094, "learning_rate": 7.998951121177129e-07, "loss": 0.0211, "num_tokens": 452901187.0, "reward": 2.16015625, "reward_std": 0.16278532147407532, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1705.0, "completions/mean_length": 508.318359375, "completions/mean_terminated_length": 505.3052673339844, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.11971444636583328, "epoch": 0.38204165244110616, "frac_reward_zero_std": 0.65625, "grad_norm": 0.216590514160424, "learning_rate": 7.994489194087868e-07, "loss": 0.0048, "num_tokens": 453235814.0, "reward": 2.13134765625, "reward_std": 0.14780272543430328, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1427.0, "completions/mean_length": 467.03515625, "completions/mean_terminated_length": 460.8353271484375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.11896458454430103, "epoch": 0.3823830658927962, "frac_reward_zero_std": 0.625, "grad_norm": 0.2592381545818847, "learning_rate": 7.990023723840689e-07, "loss": 0.0378, "num_tokens": 453551416.0, "reward": 2.2919921875, "reward_std": 0.16273824870586395, "rewards/accuracy_reward/mean": 0.298828125, "rewards/accuracy_reward/std": 0.45819199085235596, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1922.0, "completions/max_terminated_length": 1922.0, "completions/mean_length": 529.607421875, "completions/mean_terminated_length": 529.607421875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.10096987709403038, "epoch": 0.38272447934448617, "frac_reward_zero_std": 0.625, "grad_norm": 0.23970606257325366, "learning_rate": 7.985554716778323e-07, "loss": -0.002, "num_tokens": 453904351.0, "reward": 2.17626953125, "reward_std": 0.14933373034000397, "rewards/accuracy_reward/mean": 0.177734375, "rewards/accuracy_reward/std": 0.3826628625392914, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1616.0, "completions/max_terminated_length": 1616.0, "completions/mean_length": 497.388671875, "completions/mean_terminated_length": 497.388671875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.09837976470589638, "epoch": 0.38306589279617614, "frac_reward_zero_std": 0.84375, "grad_norm": 0.1288391157940668, "learning_rate": 7.981082179248519e-07, "loss": 0.0077, "num_tokens": 454230230.0, "reward": 2.15625, "reward_std": 0.061308603733778, "rewards/accuracy_reward/mean": 0.16129031777381897, "rewards/accuracy_reward/std": 0.3681698739528656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1687.0, "completions/mean_length": 556.31640625, "completions/mean_terminated_length": 538.6284790039062, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.10127348452806473, "epoch": 0.3834073062478662, "frac_reward_zero_std": 0.71875, "grad_norm": 0.2548225851185905, "learning_rate": 7.976606117604041e-07, "loss": 0.0545, "num_tokens": 454593976.0, "reward": 2.05810546875, "reward_std": 0.12726232409477234, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.98974609375, "rewards/tag_count_reward/std": 0.08717872947454453, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1567.0, "completions/max_terminated_length": 1567.0, "completions/mean_length": 495.712890625, "completions/mean_terminated_length": 495.712890625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.10075792856514454, "epoch": 0.38374871969955615, "frac_reward_zero_std": 0.65625, "grad_norm": 0.22598587960310787, "learning_rate": 7.972126538202666e-07, "loss": -0.001, "num_tokens": 454917957.0, "reward": 2.130859375, "reward_std": 0.14014196395874023, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1644.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 475.91015625, "completions/mean_terminated_length": 475.91015625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.11021986231207848, "epoch": 0.3840901331512462, "frac_reward_zero_std": 0.53125, "grad_norm": 0.34338178623599386, "learning_rate": 7.967643447407161e-07, "loss": -0.0343, "num_tokens": 455239207.0, "reward": 2.1396484375, "reward_std": 0.17377188801765442, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 475.787109375, "completions/mean_terminated_length": 475.787109375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.09468472190201283, "epoch": 0.38443154660293616, "frac_reward_zero_std": 0.65625, "grad_norm": 0.24923375263138725, "learning_rate": 7.963156851585279e-07, "loss": -0.0128, "num_tokens": 455552218.0, "reward": 2.140625, "reward_std": 0.1356286108493805, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1659.0, "completions/max_terminated_length": 1659.0, "completions/mean_length": 506.337890625, "completions/mean_terminated_length": 506.337890625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.10322989709675312, "epoch": 0.38477296005462613, "frac_reward_zero_std": 0.65625, "grad_norm": 0.31010418795226324, "learning_rate": 7.958666757109757e-07, "loss": -0.0145, "num_tokens": 455893351.0, "reward": 2.2001953125, "reward_std": 0.1430140733718872, "rewards/accuracy_reward/mean": 0.20703125, "rewards/accuracy_reward/std": 0.40557438135147095, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1368.0, "completions/max_terminated_length": 1368.0, "completions/mean_length": 480.390625, "completions/mean_terminated_length": 480.390625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.104818195104599, "epoch": 0.38511437350631617, "frac_reward_zero_std": 0.8125, "grad_norm": 0.2152086662921316, "learning_rate": 7.954173170358298e-07, "loss": -0.0045, "num_tokens": 456216111.0, "reward": 2.08544921875, "reward_std": 0.06923790276050568, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 475.87109375, "completions/mean_terminated_length": 472.7945251464844, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.11127564124763012, "epoch": 0.38545578695800614, "frac_reward_zero_std": 0.78125, "grad_norm": 0.2546905945031867, "learning_rate": 7.949676097713569e-07, "loss": 0.0029, "num_tokens": 456528893.0, "reward": 2.10595703125, "reward_std": 0.09909939765930176, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1364.0, "completions/mean_length": 485.603515625, "completions/mean_terminated_length": 479.47650146484375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.1086287647485733, "epoch": 0.3857972004096961, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3940246997569742, "learning_rate": 7.945175545563182e-07, "loss": 0.0282, "num_tokens": 456864834.0, "reward": 2.1640625, "reward_std": 0.16164018213748932, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.051725368946790695, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1468.0, "completions/mean_length": 455.1171875, "completions/mean_terminated_length": 452.0, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.11805460415780544, "epoch": 0.38613861386138615, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2744001017506884, "learning_rate": 7.940671520299697e-07, "loss": 0.0094, "num_tokens": 457174078.0, "reward": 2.1376953125, "reward_std": 0.13803710043430328, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.041276250034570694, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 505.771484375, "completions/mean_terminated_length": 502.75341796875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.10719721019268036, "epoch": 0.3864800273130761, "frac_reward_zero_std": 0.625, "grad_norm": 0.3016891896556309, "learning_rate": 7.936164028320608e-07, "loss": 0.009, "num_tokens": 457509177.0, "reward": 2.140625, "reward_std": 0.14249879121780396, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1387.0, "completions/mean_length": 504.333984375, "completions/mean_terminated_length": 501.3131103515625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.11276162415742874, "epoch": 0.38682144076476616, "frac_reward_zero_std": 0.625, "grad_norm": 0.3369995491721714, "learning_rate": 7.931653076028325e-07, "loss": 0.0122, "num_tokens": 457854324.0, "reward": 2.12744140625, "reward_std": 0.1273396611213684, "rewards/accuracy_reward/mean": 0.13508065044879913, "rewards/accuracy_reward/std": 0.3421548008918762, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1372.0, "completions/mean_length": 529.0078125, "completions/mean_terminated_length": 514.0276489257812, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.09877416305243969, "epoch": 0.38716285421645613, "frac_reward_zero_std": 0.71875, "grad_norm": 0.40689615714949845, "learning_rate": 7.927138669830181e-07, "loss": 0.0556, "num_tokens": 458204584.0, "reward": 2.16357421875, "reward_std": 0.1330091953277588, "rewards/accuracy_reward/mean": 0.181640625, "rewards/accuracy_reward/std": 0.38592514395713806, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99169921875, "rewards/tag_count_reward/std": 0.07696826010942459, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1558.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 493.201171875, "completions/mean_terminated_length": 493.201171875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.11833982542157173, "epoch": 0.3875042676681461, "frac_reward_zero_std": 0.6875, "grad_norm": 0.3023507900241952, "learning_rate": 7.922620816138412e-07, "loss": 0.0072, "num_tokens": 458541759.0, "reward": 2.1064453125, "reward_std": 0.12176141142845154, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1549.0, "completions/max_terminated_length": 1549.0, "completions/mean_length": 531.08984375, "completions/mean_terminated_length": 531.08984375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.09717494435608387, "epoch": 0.38784568111983614, "frac_reward_zero_std": 0.78125, "grad_norm": 0.21057139110755427, "learning_rate": 7.918099521370152e-07, "loss": -0.0005, "num_tokens": 458897229.0, "reward": 2.11328125, "reward_std": 0.08741521090269089, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 514.513671875, "completions/mean_terminated_length": 511.5127258300781, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.11345809511840343, "epoch": 0.3881870945715261, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2569517432274021, "learning_rate": 7.913574791947421e-07, "loss": 0.0071, "num_tokens": 459240484.0, "reward": 2.21435546875, "reward_std": 0.12757012248039246, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41380295157432556, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.039800092577934265, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 606.947265625, "completions/mean_terminated_length": 592.7357177734375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.07572961039841175, "epoch": 0.3885285080232161, "frac_reward_zero_std": 0.6875, "grad_norm": 0.22653049812304032, "learning_rate": 7.909046634297119e-07, "loss": 0.0223, "num_tokens": 459627177.0, "reward": 2.16064453125, "reward_std": 0.13523554801940918, "rewards/accuracy_reward/mean": 0.177734375, "rewards/accuracy_reward/std": 0.3826628625392914, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 526.26171875, "completions/mean_terminated_length": 523.2837524414062, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.10287166014313698, "epoch": 0.3888699214749061, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2620816016452582, "learning_rate": 7.904515054851012e-07, "loss": 0.0014, "num_tokens": 459967167.0, "reward": 2.08740234375, "reward_std": 0.10774172842502594, "rewards/accuracy_reward/mean": 0.0947580635547638, "rewards/accuracy_reward/std": 0.29317617416381836, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.039800092577934265, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1325.0, "completions/mean_length": 488.3671875, "completions/mean_terminated_length": 479.17486572265625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.11752278171479702, "epoch": 0.3892113349265961, "frac_reward_zero_std": 0.5625, "grad_norm": 0.42634513585641726, "learning_rate": 7.899980060045732e-07, "loss": 0.0528, "num_tokens": 460295147.0, "reward": 2.0908203125, "reward_std": 0.17262578010559082, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06960996240377426, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1287.0, "completions/mean_length": 506.9453125, "completions/mean_terminated_length": 497.86248779296875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.11020007729530334, "epoch": 0.38955274837828613, "frac_reward_zero_std": 0.6875, "grad_norm": 0.24850102462725437, "learning_rate": 7.895441656322757e-07, "loss": 0.0306, "num_tokens": 460630591.0, "reward": 2.06787109375, "reward_std": 0.11496125161647797, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1882.0, "completions/max_terminated_length": 1882.0, "completions/mean_length": 525.8203125, "completions/mean_terminated_length": 525.8203125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.10162682831287384, "epoch": 0.3898941618299761, "frac_reward_zero_std": 0.65625, "grad_norm": 0.26385960960933724, "learning_rate": 7.890899850128413e-07, "loss": 0.0093, "num_tokens": 460971427.0, "reward": 2.15185546875, "reward_std": 0.12525826692581177, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2047.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 562.67578125, "completions/mean_terminated_length": 514.7943115234375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.12187549844384193, "epoch": 0.3902355752816661, "frac_reward_zero_std": 0.59375, "grad_norm": 0.27551913963402236, "learning_rate": 7.886354647913851e-07, "loss": -0.0054, "num_tokens": 461342365.0, "reward": 2.21240234375, "reward_std": 0.16107942163944244, "rewards/accuracy_reward/mean": 0.26953125, "rewards/accuracy_reward/std": 0.44415023922920227, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.97607421875, "rewards/tag_count_reward/std": 0.13100102543830872, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2047.0, "completions/max_terminated_length": 1932.0, "completions/mean_length": 576.611328125, "completions/mean_terminated_length": 541.322021484375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.12782526947557926, "epoch": 0.3905769887333561, "frac_reward_zero_std": 0.65625, "grad_norm": 0.28502220083725627, "learning_rate": 7.881806056135051e-07, "loss": -0.0102, "num_tokens": 461726966.0, "reward": 2.10205078125, "reward_std": 0.1352289319038391, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17416280508041382, "rewards/tag_count_reward/mean": 0.97900390625, "rewards/tag_count_reward/std": 0.11879855394363403, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1487.0, "completions/max_terminated_length": 1487.0, "completions/mean_length": 534.642578125, "completions/mean_terminated_length": 534.642578125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.1185589600354433, "epoch": 0.3909184021850461, "frac_reward_zero_std": 0.8125, "grad_norm": 0.18404685402879575, "learning_rate": 7.877254081252808e-07, "loss": -0.0112, "num_tokens": 462079023.0, "reward": 2.052734375, "reward_std": 0.08050891757011414, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2041.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 564.16015625, "completions/mean_terminated_length": 564.16015625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.10004252940416336, "epoch": 0.39125981563673606, "frac_reward_zero_std": 0.53125, "grad_norm": 0.3702344060639525, "learning_rate": 7.872698729732716e-07, "loss": -0.0282, "num_tokens": 462450289.0, "reward": 2.111328125, "reward_std": 0.1561925709247589, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17416280508041382, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1561.0, "completions/mean_length": 508.37109375, "completions/mean_terminated_length": 502.3333740234375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.10466768220067024, "epoch": 0.3916012290884261, "frac_reward_zero_std": 0.65625, "grad_norm": 0.28394477549113717, "learning_rate": 7.868140008045176e-07, "loss": 0.0205, "num_tokens": 462782751.0, "reward": 2.1474609375, "reward_std": 0.15007534623146057, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1903.0, "completions/mean_length": 557.89453125, "completions/mean_terminated_length": 506.75152587890625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.1070393119007349, "epoch": 0.39194264254011607, "frac_reward_zero_std": 0.625, "grad_norm": 0.26896883030635366, "learning_rate": 7.863577922665367e-07, "loss": 0.0207, "num_tokens": 463148233.0, "reward": 2.107421875, "reward_std": 0.13263991475105286, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.974609375, "rewards/tag_count_reward/std": 0.13395914435386658, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1986.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 515.34375, "completions/mean_terminated_length": 515.34375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.10200240090489388, "epoch": 0.3922840559918061, "frac_reward_zero_std": 0.53125, "grad_norm": 0.3078395683464855, "learning_rate": 7.859012480073244e-07, "loss": -0.0126, "num_tokens": 463489481.0, "reward": 2.29296875, "reward_std": 0.18453587591648102, "rewards/accuracy_reward/mean": 0.29296875, "rewards/accuracy_reward/std": 0.455569326877594, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1060.0, "completions/max_terminated_length": 1060.0, "completions/mean_length": 475.953125, "completions/mean_terminated_length": 475.953125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.10988203808665276, "epoch": 0.3926254694434961, "frac_reward_zero_std": 0.59375, "grad_norm": 0.289671479383103, "learning_rate": 7.854443686753542e-07, "loss": 0.0094, "num_tokens": 463806065.0, "reward": 2.2001953125, "reward_std": 0.16942428052425385, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.4027182459831238, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1521.0, "completions/max_terminated_length": 1521.0, "completions/mean_length": 545.5625, "completions/mean_terminated_length": 545.5625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.10011165589094162, "epoch": 0.39296688289518605, "frac_reward_zero_std": 0.5625, "grad_norm": 0.28684808079718266, "learning_rate": 7.849871549195745e-07, "loss": -0.0173, "num_tokens": 464156881.0, "reward": 2.2314453125, "reward_std": 0.18722933530807495, "rewards/accuracy_reward/mean": 0.24193547666072845, "rewards/accuracy_reward/std": 0.42868778109550476, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1814.0, "completions/mean_length": 556.248046875, "completions/mean_terminated_length": 553.3287353515625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.10082178562879562, "epoch": 0.3933082963468761, "frac_reward_zero_std": 0.625, "grad_norm": 0.3610467011622888, "learning_rate": 7.845296073894092e-07, "loss": 0.0108, "num_tokens": 464525136.0, "reward": 2.12255859375, "reward_std": 0.16028624773025513, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.039800092577934265, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1839.0, "completions/mean_length": 515.673828125, "completions/mean_terminated_length": 509.66473388671875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.1034848503768444, "epoch": 0.39364970979856606, "frac_reward_zero_std": 0.65625, "grad_norm": 0.29643588138391985, "learning_rate": 7.840717267347559e-07, "loss": 0.0032, "num_tokens": 464871657.0, "reward": 2.11669921875, "reward_std": 0.1453368365764618, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1312.0, "completions/max_terminated_length": 1312.0, "completions/mean_length": 557.947265625, "completions/mean_terminated_length": 557.947265625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.09223855473101139, "epoch": 0.39399112325025604, "frac_reward_zero_std": 0.75, "grad_norm": 0.2245767473111818, "learning_rate": 7.836135136059859e-07, "loss": -0.0109, "num_tokens": 465225614.0, "reward": 2.0771484375, "reward_std": 0.08736266195774078, "rewards/accuracy_reward/mean": 0.08064515888690948, "rewards/accuracy_reward/std": 0.2725643217563629, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 505.16796875, "completions/mean_terminated_length": 505.16796875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.10739469528198242, "epoch": 0.39433253670194607, "frac_reward_zero_std": 0.71875, "grad_norm": 0.289097792272546, "learning_rate": 7.831549686539424e-07, "loss": -0.0072, "num_tokens": 465560324.0, "reward": 2.12109375, "reward_std": 0.10214962065219879, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1284.0, "completions/mean_length": 496.69921875, "completions/mean_terminated_length": 493.66339111328125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.10508690401911736, "epoch": 0.39467395015363604, "frac_reward_zero_std": 0.65625, "grad_norm": 0.28889784443507777, "learning_rate": 7.826960925299398e-07, "loss": 0.0398, "num_tokens": 465895210.0, "reward": 2.07080078125, "reward_std": 0.1277327835559845, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.045470330864191055, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 533.0859375, "completions/mean_terminated_length": 527.1451416015625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.11476417072117329, "epoch": 0.3950153636053261, "frac_reward_zero_std": 0.71875, "grad_norm": 0.2331317624968241, "learning_rate": 7.822368858857632e-07, "loss": 0.008, "num_tokens": 466254438.0, "reward": 2.080078125, "reward_std": 0.0998808890581131, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17416280508041382, "rewards/tag_count_reward/mean": 0.982421875, "rewards/tag_count_reward/std": 0.09479400515556335, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1813.0, "completions/mean_length": 513.908203125, "completions/mean_terminated_length": 510.90606689453125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.11858974024653435, "epoch": 0.39535677705701605, "frac_reward_zero_std": 0.65625, "grad_norm": 0.3365239511684183, "learning_rate": 7.817773493736669e-07, "loss": 0.0058, "num_tokens": 466592999.0, "reward": 2.05712890625, "reward_std": 0.11512824147939682, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 485.294921875, "completions/mean_terminated_length": 479.16668701171875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.10319733247160912, "epoch": 0.395698190508706, "frac_reward_zero_std": 0.65625, "grad_norm": 0.4866779299508674, "learning_rate": 7.813174836463741e-07, "loss": 0.0314, "num_tokens": 466914702.0, "reward": 2.1435546875, "reward_std": 0.15564318001270294, "rewards/accuracy_reward/mean": 0.1552419364452362, "rewards/accuracy_reward/std": 0.36250078678131104, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2047.0, "completions/max_terminated_length": 1950.0, "completions/mean_length": 577.505859375, "completions/mean_terminated_length": 548.2330932617188, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.11678752303123474, "epoch": 0.39603960396039606, "frac_reward_zero_std": 0.71875, "grad_norm": 0.22723212237157497, "learning_rate": 7.808572893570753e-07, "loss": 0.0016, "num_tokens": 467293905.0, "reward": 2.126953125, "reward_std": 0.12529122829437256, "rewards/accuracy_reward/mean": 0.18346774578094482, "rewards/accuracy_reward/std": 0.3874402940273285, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17416280508041382, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.11107628047466278, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 482.380859375, "completions/mean_terminated_length": 479.3170166015625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.10931374691426754, "epoch": 0.39638101741208603, "frac_reward_zero_std": 0.71875, "grad_norm": 0.2947183517154306, "learning_rate": 7.803967671594277e-07, "loss": -0.0065, "num_tokens": 467612900.0, "reward": 2.14501953125, "reward_std": 0.10874119400978088, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2036.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 490.841796875, "completions/mean_terminated_length": 490.841796875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.10495237819850445, "epoch": 0.396722430863776, "frac_reward_zero_std": 0.71875, "grad_norm": 0.29526437920428006, "learning_rate": 7.799359177075546e-07, "loss": -0.0124, "num_tokens": 467946307.0, "reward": 2.15576171875, "reward_std": 0.11825722455978394, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1616.0, "completions/mean_length": 536.5, "completions/mean_terminated_length": 533.5420532226562, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.10136777721345425, "epoch": 0.39706384431546604, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2977153852996097, "learning_rate": 7.794747416560436e-07, "loss": 0.0238, "num_tokens": 468296515.0, "reward": 2.21142578125, "reward_std": 0.1267658770084381, "rewards/accuracy_reward/mean": 0.21484375, "rewards/accuracy_reward/std": 0.4111155867576599, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1251.0, "completions/max_terminated_length": 1251.0, "completions/mean_length": 462.42578125, "completions/mean_terminated_length": 462.42578125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.11534732766449451, "epoch": 0.397405257767156, "frac_reward_zero_std": 0.71875, "grad_norm": 0.24431874997604378, "learning_rate": 7.790132396599467e-07, "loss": -0.0001, "num_tokens": 468610765.0, "reward": 2.146484375, "reward_std": 0.11913130432367325, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1433.0, "completions/mean_length": 463.205078125, "completions/mean_terminated_length": 450.72637939453125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.10290912538766861, "epoch": 0.39774667121884605, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5138594834209385, "learning_rate": 7.785514123747784e-07, "loss": 0.0709, "num_tokens": 468916822.0, "reward": 2.109375, "reward_std": 0.12869372963905334, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1871.0, "completions/mean_length": 501.59375, "completions/mean_terminated_length": 498.5675048828125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.09640478156507015, "epoch": 0.398088084670536, "frac_reward_zero_std": 0.65625, "grad_norm": 0.30059250806033083, "learning_rate": 7.780892604565158e-07, "loss": 0.0143, "num_tokens": 469249526.0, "reward": 2.09619140625, "reward_std": 0.13727441430091858, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 527.572265625, "completions/mean_terminated_length": 506.4970397949219, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.10903534293174744, "epoch": 0.398429498122226, "frac_reward_zero_std": 0.75, "grad_norm": 0.2839314980132406, "learning_rate": 7.776267845615964e-07, "loss": 0.0402, "num_tokens": 469597611.0, "reward": 2.087890625, "reward_std": 0.11700011789798737, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.09310565888881683, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1373.0, "completions/mean_length": 542.96875, "completions/mean_terminated_length": 537.0667114257812, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.09377849847078323, "epoch": 0.39877091157391603, "frac_reward_zero_std": 0.59375, "grad_norm": 0.4170332488040487, "learning_rate": 7.771639853469186e-07, "loss": 0.0236, "num_tokens": 469950331.0, "reward": 2.16455078125, "reward_std": 0.16048023104667664, "rewards/accuracy_reward/mean": 0.173828125, "rewards/accuracy_reward/std": 0.3793322443962097, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1694.0, "completions/mean_length": 516.099609375, "completions/mean_terminated_length": 510.0921936035156, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.09009522385895252, "epoch": 0.399112325025606, "frac_reward_zero_std": 0.59375, "grad_norm": 0.2647035796861346, "learning_rate": 7.767008634698395e-07, "loss": 0.0236, "num_tokens": 470288622.0, "reward": 2.1318359375, "reward_std": 0.16748476028442383, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1822.0, "completions/max_terminated_length": 1822.0, "completions/mean_length": 521.365234375, "completions/mean_terminated_length": 521.365234375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.09582807682454586, "epoch": 0.399453738477296, "frac_reward_zero_std": 0.71875, "grad_norm": 0.28469745444943023, "learning_rate": 7.76237419588175e-07, "loss": 0.0014, "num_tokens": 470630217.0, "reward": 2.138671875, "reward_std": 0.13166329264640808, "rewards/accuracy_reward/mean": 0.14314515888690948, "rewards/accuracy_reward/std": 0.35057440400123596, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1084.0, "completions/max_terminated_length": 1084.0, "completions/mean_length": 475.08203125, "completions/mean_terminated_length": 475.08203125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.10738998465240002, "epoch": 0.399795151928986, "frac_reward_zero_std": 0.71875, "grad_norm": 0.22029326175611763, "learning_rate": 7.757736543601977e-07, "loss": -0.0212, "num_tokens": 470953987.0, "reward": 2.13916015625, "reward_std": 0.10688671469688416, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1683.0, "completions/mean_length": 480.96484375, "completions/mean_terminated_length": 477.8982238769531, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.1113879643380642, "epoch": 0.400136565380676, "frac_reward_zero_std": 0.65625, "grad_norm": 0.2867241362185466, "learning_rate": 7.753095684446373e-07, "loss": 0.0176, "num_tokens": 471283537.0, "reward": 2.11962890625, "reward_std": 0.11914129555225372, "rewards/accuracy_reward/mean": 0.1270161271095276, "rewards/accuracy_reward/std": 0.3333272337913513, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 549.37109375, "completions/mean_terminated_length": 543.494140625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.10262533277273178, "epoch": 0.400477978832366, "frac_reward_zero_std": 0.5625, "grad_norm": 0.30670242681142434, "learning_rate": 7.748451625006786e-07, "loss": 0.0248, "num_tokens": 471646127.0, "reward": 2.0986328125, "reward_std": 0.17531442642211914, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17416280508041382, "rewards/tag_count_reward/mean": 0.9892578125, "rewards/tag_count_reward/std": 0.06911401450634003, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1552.0, "completions/max_terminated_length": 1552.0, "completions/mean_length": 512.830078125, "completions/mean_terminated_length": 512.830078125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.10345245338976383, "epoch": 0.400819392284056, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2689734157551411, "learning_rate": 7.743804371879612e-07, "loss": -0.0016, "num_tokens": 471981304.0, "reward": 2.224609375, "reward_std": 0.16373786330223083, "rewards/accuracy_reward/mean": 0.224609375, "rewards/accuracy_reward/std": 0.41773295402526855, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1870.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 541.58984375, "completions/mean_terminated_length": 541.58984375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.09226824529469013, "epoch": 0.401160805735746, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2152361657156869, "learning_rate": 7.739153931665782e-07, "loss": -0.0051, "num_tokens": 472332918.0, "reward": 2.123046875, "reward_std": 0.12351766228675842, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1554.0, "completions/mean_length": 571.28515625, "completions/mean_terminated_length": 568.3953247070312, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.0931712705641985, "epoch": 0.401502219187436, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2071388304612884, "learning_rate": 7.734500310970753e-07, "loss": 0.0177, "num_tokens": 472699624.0, "reward": 2.08837890625, "reward_std": 0.13716961443424225, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1728.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 549.12109375, "completions/mean_terminated_length": 549.12109375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.10216588526964188, "epoch": 0.401843632639126, "frac_reward_zero_std": 0.71875, "grad_norm": 0.23797910127665317, "learning_rate": 7.729843516404501e-07, "loss": 0.013, "num_tokens": 473055190.0, "reward": 2.1181640625, "reward_std": 0.10650871694087982, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1756.0, "completions/mean_length": 530.880859375, "completions/mean_terminated_length": 527.9119262695312, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.11018801108002663, "epoch": 0.40218504609081596, "frac_reward_zero_std": 0.78125, "grad_norm": 0.2199111573034714, "learning_rate": 7.725183554581513e-07, "loss": 0.0135, "num_tokens": 473407177.0, "reward": 2.11962890625, "reward_std": 0.09449433535337448, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1784.0, "completions/mean_length": 553.701171875, "completions/mean_terminated_length": 547.8411865234375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.10414212942123413, "epoch": 0.402526459542506, "frac_reward_zero_std": 0.625, "grad_norm": 0.22409075883180313, "learning_rate": 7.720520432120768e-07, "loss": 0.0155, "num_tokens": 473766176.0, "reward": 2.1123046875, "reward_std": 0.15251141786575317, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1504.0, "completions/mean_length": 564.271484375, "completions/mean_terminated_length": 561.367919921875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.11982735805213451, "epoch": 0.40286787299419596, "frac_reward_zero_std": 0.6875, "grad_norm": 0.23173830374806736, "learning_rate": 7.715854155645739e-07, "loss": 0.0267, "num_tokens": 474133355.0, "reward": 2.0869140625, "reward_std": 0.12967242300510406, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1873.0, "completions/mean_length": 545.611328125, "completions/mean_terminated_length": 539.7196655273438, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.10585300996899605, "epoch": 0.403209286445886, "frac_reward_zero_std": 0.625, "grad_norm": 0.1882092039753598, "learning_rate": 7.711184731784378e-07, "loss": 0.0121, "num_tokens": 474481972.0, "reward": 2.1435546875, "reward_std": 0.155439555644989, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 524.8671875, "completions/mean_terminated_length": 524.8671875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.1157939713448286, "epoch": 0.40355069989757597, "frac_reward_zero_std": 0.65625, "grad_norm": 0.24050937292730698, "learning_rate": 7.70651216716911e-07, "loss": -0.0167, "num_tokens": 474827504.0, "reward": 2.130859375, "reward_std": 0.12015876173973083, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1321.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 539.794921875, "completions/mean_terminated_length": 539.794921875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.11278784461319447, "epoch": 0.40389211334926595, "frac_reward_zero_std": 0.75, "grad_norm": 0.16625003901667554, "learning_rate": 7.70183646843681e-07, "loss": -0.001, "num_tokens": 475183271.0, "reward": 2.150390625, "reward_std": 0.09052316844463348, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1731.0, "completions/max_terminated_length": 1731.0, "completions/mean_length": 556.525390625, "completions/mean_terminated_length": 556.525390625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.1204438116401434, "epoch": 0.404233526800956, "frac_reward_zero_std": 0.5, "grad_norm": 0.27535577175743786, "learning_rate": 7.697157642228826e-07, "loss": -0.0101, "num_tokens": 475547572.0, "reward": 2.18359375, "reward_std": 0.20725488662719727, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.3875311613082886, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 2047.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 551.376953125, "completions/mean_terminated_length": 542.5618896484375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.1261000670492649, "epoch": 0.40457494025264595, "frac_reward_zero_std": 0.59375, "grad_norm": 0.23881324980449684, "learning_rate": 7.692475695190924e-07, "loss": 0.0083, "num_tokens": 475913477.0, "reward": 2.09765625, "reward_std": 0.14218980073928833, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.982421875, "rewards/tag_count_reward/std": 0.09607560932636261, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1284.0, "completions/max_terminated_length": 1284.0, "completions/mean_length": 509.61328125, "completions/mean_terminated_length": 509.61328125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.12682026624679565, "epoch": 0.40491635370433593, "frac_reward_zero_std": 0.6875, "grad_norm": 0.24619828366791296, "learning_rate": 7.68779063397332e-07, "loss": -0.0156, "num_tokens": 476253391.0, "reward": 2.13720703125, "reward_std": 0.12835705280303955, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1831.0, "completions/mean_length": 505.609375, "completions/mean_terminated_length": 499.5608215332031, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.12979884445667267, "epoch": 0.40525776715602596, "frac_reward_zero_std": 0.65625, "grad_norm": 0.23440333694276982, "learning_rate": 7.683102465230648e-07, "loss": 0.0236, "num_tokens": 476591767.0, "reward": 2.1962890625, "reward_std": 0.13010463118553162, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.4027182459831238, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1278.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 523.416015625, "completions/mean_terminated_length": 523.416015625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.10816860757768154, "epoch": 0.40559918060771594, "frac_reward_zero_std": 0.65625, "grad_norm": 0.23156371401681108, "learning_rate": 7.678411195621953e-07, "loss": -0.0073, "num_tokens": 476932412.0, "reward": 2.1015625, "reward_std": 0.1495179533958435, "rewards/accuracy_reward/mean": 0.10833333432674408, "rewards/accuracy_reward/std": 0.3111251890659332, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1598.0, "completions/max_terminated_length": 1598.0, "completions/mean_length": 514.71875, "completions/mean_terminated_length": 514.71875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.11924396269023418, "epoch": 0.40594059405940597, "frac_reward_zero_std": 0.71875, "grad_norm": 0.20272706006007682, "learning_rate": 7.673716831810688e-07, "loss": -0.0048, "num_tokens": 477275420.0, "reward": 2.197265625, "reward_std": 0.1111922413110733, "rewards/accuracy_reward/mean": 0.197265625, "rewards/accuracy_reward/std": 0.3983237147331238, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1495.0, "completions/mean_length": 599.890625, "completions/mean_terminated_length": 594.2117919921875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.0971105583012104, "epoch": 0.40628200751109594, "frac_reward_zero_std": 0.53125, "grad_norm": 0.23757278213418728, "learning_rate": 7.669019380464703e-07, "loss": 0.021, "num_tokens": 477651396.0, "reward": 2.162109375, "reward_std": 0.1910194754600525, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.3875311613082886, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.06589367985725403, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1626.0, "completions/max_terminated_length": 1626.0, "completions/mean_length": 544.12109375, "completions/mean_terminated_length": 544.12109375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.09901374764740467, "epoch": 0.4066234209627859, "frac_reward_zero_std": 0.6875, "grad_norm": 0.25476588050402904, "learning_rate": 7.664318848256226e-07, "loss": -0.0107, "num_tokens": 478000226.0, "reward": 2.150390625, "reward_std": 0.1279090940952301, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1679.0, "completions/max_terminated_length": 1679.0, "completions/mean_length": 579.091796875, "completions/mean_terminated_length": 579.091796875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.10234055109322071, "epoch": 0.40696483441447595, "frac_reward_zero_std": 0.5625, "grad_norm": 0.23931876848834582, "learning_rate": 7.659615241861867e-07, "loss": -0.0272, "num_tokens": 478370689.0, "reward": 2.1875, "reward_std": 0.16779756546020508, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39069411158561707, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1786.0, "completions/mean_length": 559.58203125, "completions/mean_terminated_length": 553.7451171875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.10927707701921463, "epoch": 0.4073062478661659, "frac_reward_zero_std": 0.65625, "grad_norm": 0.2291854806269047, "learning_rate": 7.654908567962601e-07, "loss": 0.0133, "num_tokens": 478730987.0, "reward": 2.15478515625, "reward_std": 0.15616291761398315, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.07382523268461227, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1284.0, "completions/max_terminated_length": 1284.0, "completions/mean_length": 498.47265625, "completions/mean_terminated_length": 498.47265625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.10806834325194359, "epoch": 0.4076476613178559, "frac_reward_zero_std": 0.6875, "grad_norm": 0.31665188738156164, "learning_rate": 7.650198833243762e-07, "loss": 0.0006, "num_tokens": 479058861.0, "reward": 2.1201171875, "reward_std": 0.12609386444091797, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 574.123046875, "completions/mean_terminated_length": 562.5177001953125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.10529188066720963, "epoch": 0.40798907476954593, "frac_reward_zero_std": 0.46875, "grad_norm": 0.38214452049966685, "learning_rate": 7.645486044395029e-07, "loss": 0.023, "num_tokens": 479436076.0, "reward": 2.11279296875, "reward_std": 0.23244252800941467, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.98974609375, "rewards/tag_count_reward/std": 0.08286299556493759, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1574.0, "completions/mean_length": 494.154296875, "completions/mean_terminated_length": 491.1134948730469, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.11713907122612, "epoch": 0.4083304882212359, "frac_reward_zero_std": 0.5625, "grad_norm": 0.4699184511532768, "learning_rate": 7.640770208110419e-07, "loss": 0.0109, "num_tokens": 479766283.0, "reward": 2.1826171875, "reward_std": 0.18123272061347961, "rewards/accuracy_reward/mean": 0.189453125, "rewards/accuracy_reward/std": 0.3922513723373413, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1518.0, "completions/mean_length": 603.98046875, "completions/mean_terminated_length": 601.1546020507812, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.09729328379034996, "epoch": 0.4086719016729259, "frac_reward_zero_std": 0.59375, "grad_norm": 0.2957565745829932, "learning_rate": 7.636051331088277e-07, "loss": 0.0265, "num_tokens": 480160721.0, "reward": 2.1533203125, "reward_std": 0.16576679050922394, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1264.0, "completions/max_terminated_length": 1264.0, "completions/mean_length": 531.5, "completions/mean_terminated_length": 531.5, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.09279585257172585, "epoch": 0.4090133151246159, "frac_reward_zero_std": 0.71875, "grad_norm": 0.2978928334450132, "learning_rate": 7.63132942003127e-07, "loss": -0.0217, "num_tokens": 480508513.0, "reward": 2.0751953125, "reward_std": 0.09776513278484344, "rewards/accuracy_reward/mean": 0.0786290317773819, "rewards/accuracy_reward/std": 0.26943066716194153, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1742.0, "completions/max_terminated_length": 1742.0, "completions/mean_length": 544.771484375, "completions/mean_terminated_length": 544.771484375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.08857590705156326, "epoch": 0.4093547285763059, "frac_reward_zero_std": 0.625, "grad_norm": 0.2817582792449731, "learning_rate": 7.626604481646375e-07, "loss": -0.002, "num_tokens": 480856892.0, "reward": 2.11962890625, "reward_std": 0.1480153203010559, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1403.0, "completions/mean_length": 551.552734375, "completions/mean_terminated_length": 548.624267578125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.10622870177030563, "epoch": 0.4096961420279959, "frac_reward_zero_std": 0.625, "grad_norm": 0.3888976622672968, "learning_rate": 7.621876522644863e-07, "loss": 0.0173, "num_tokens": 481219447.0, "reward": 2.15283203125, "reward_std": 0.16660341620445251, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1214.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 499.8046875, "completions/mean_terminated_length": 499.8046875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.09958293102681637, "epoch": 0.4100375554796859, "frac_reward_zero_std": 0.78125, "grad_norm": 0.24161256266868228, "learning_rate": 7.617145549742302e-07, "loss": -0.0021, "num_tokens": 481549523.0, "reward": 2.16650390625, "reward_std": 0.118258997797966, "rewards/accuracy_reward/mean": 0.17578125, "rewards/accuracy_reward/std": 0.3810062110424042, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.05721401423215866, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1246.0, "completions/mean_length": 535.6484375, "completions/mean_terminated_length": 532.6888427734375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.09927802719175816, "epoch": 0.4103789689313759, "frac_reward_zero_std": 0.59375, "grad_norm": 0.4142788062098508, "learning_rate": 7.612411569658539e-07, "loss": -0.021, "num_tokens": 481900063.0, "reward": 2.0712890625, "reward_std": 0.1611333191394806, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06412246823310852, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1898.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 582.357421875, "completions/mean_terminated_length": 582.357421875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.09455113299190998, "epoch": 0.4107203823830659, "frac_reward_zero_std": 0.6875, "grad_norm": 0.32835968640203606, "learning_rate": 7.607674589117691e-07, "loss": -0.0049, "num_tokens": 482279510.0, "reward": 2.0859375, "reward_std": 0.11254178732633591, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1373.0, "completions/max_terminated_length": 1373.0, "completions/mean_length": 502.3984375, "completions/mean_terminated_length": 502.3984375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.10543244332075119, "epoch": 0.4110617958347559, "frac_reward_zero_std": 0.625, "grad_norm": 0.43368644583003607, "learning_rate": 7.602934614848135e-07, "loss": -0.0043, "num_tokens": 482615378.0, "reward": 2.16455078125, "reward_std": 0.1481335461139679, "rewards/accuracy_reward/mean": 0.17578125, "rewards/accuracy_reward/std": 0.3810062110424042, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04269581660628319, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1750.0, "completions/mean_length": 452.865234375, "completions/mean_terminated_length": 449.7436218261719, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.11901674419641495, "epoch": 0.41140320928644586, "frac_reward_zero_std": 0.65625, "grad_norm": 0.5079447505365033, "learning_rate": 7.598191653582505e-07, "loss": 0.0263, "num_tokens": 482925549.0, "reward": 2.11669921875, "reward_std": 0.1332973688840866, "rewards/accuracy_reward/mean": 0.12903225421905518, "rewards/accuracy_reward/std": 0.33557409048080444, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 541.181640625, "completions/mean_terminated_length": 535.2725830078125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.09150746464729309, "epoch": 0.4117446227381359, "frac_reward_zero_std": 0.5625, "grad_norm": 0.42662529209761557, "learning_rate": 7.593445712057676e-07, "loss": 0.027, "num_tokens": 483279290.0, "reward": 2.1337890625, "reward_std": 0.17179518938064575, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 535.556640625, "completions/mean_terminated_length": 529.6255493164062, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.10966764762997627, "epoch": 0.41208603618982587, "frac_reward_zero_std": 0.625, "grad_norm": 0.5286212843816394, "learning_rate": 7.588696797014755e-07, "loss": 0.0187, "num_tokens": 483632263.0, "reward": 2.107421875, "reward_std": 0.16409426927566528, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.07294141501188278, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 509.587890625, "completions/mean_terminated_length": 506.5773010253906, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.1101537961512804, "epoch": 0.4124274496415159, "frac_reward_zero_std": 0.59375, "grad_norm": 0.5776501300414174, "learning_rate": 7.583944915199073e-07, "loss": 0.028, "num_tokens": 483973348.0, "reward": 2.15234375, "reward_std": 0.16262753307819366, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.06589367985725403, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 560.607421875, "completions/mean_terminated_length": 548.8956909179688, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.1025428120046854, "epoch": 0.4127688630932059, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6067205455202753, "learning_rate": 7.579190073360175e-07, "loss": 0.0357, "num_tokens": 484337035.0, "reward": 2.09326171875, "reward_std": 0.16535422205924988, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.06124715134501457, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1919.0, "completions/mean_length": 540.19921875, "completions/mean_terminated_length": 534.2863159179688, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 0.09743431955575943, "epoch": 0.41311027654489585, "frac_reward_zero_std": 0.5625, "grad_norm": 4.730145885970985, "learning_rate": 7.574432278251813e-07, "loss": 0.0517, "num_tokens": 484683665.0, "reward": 2.07666015625, "reward_std": 0.20439413189888, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.98486328125, "rewards/tag_count_reward/std": 0.09960971027612686, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 497.138671875, "completions/mean_terminated_length": 481.84417724609375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.10498242266476154, "epoch": 0.4134516899965859, "frac_reward_zero_std": 0.5, "grad_norm": 0.788584612189422, "learning_rate": 7.569671536631928e-07, "loss": 0.0607, "num_tokens": 485024808.0, "reward": 2.01611328125, "reward_std": 0.19667378067970276, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.951171875, "rewards/format_reward/std": 0.2157193273305893, "rewards/tag_count_reward/mean": 0.97314453125, "rewards/tag_count_reward/std": 0.11967206746339798, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1691.0, "completions/mean_length": 456.28125, "completions/mean_terminated_length": 443.7480163574219, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.12123570777475834, "epoch": 0.41379310344827586, "frac_reward_zero_std": 0.46875, "grad_norm": 0.8714566095590287, "learning_rate": 7.564907855262652e-07, "loss": 0.0427, "num_tokens": 485336792.0, "reward": 2.076171875, "reward_std": 0.24658828973770142, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.978515625, "rewards/tag_count_reward/std": 0.10960409045219421, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 535.6328125, "completions/mean_terminated_length": 520.7179565429688, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.12223983742296696, "epoch": 0.41413451689996583, "frac_reward_zero_std": 0.375, "grad_norm": 1.0337592582851274, "learning_rate": 7.560141240910292e-07, "loss": 0.0681, "num_tokens": 485688716.0, "reward": 1.90966796875, "reward_std": 0.27436697483062744, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.904296875, "rewards/format_reward/std": 0.2944713830947876, "rewards/tag_count_reward/mean": 0.94482421875, "rewards/tag_count_reward/std": 0.1817440241575241, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 480.857421875, "completions/mean_terminated_length": 423.75506591796875, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 0.12891611456871033, "epoch": 0.41447593035165586, "frac_reward_zero_std": 0.15625, "grad_norm": 2.4715006605116, "learning_rate": 7.555371700345314e-07, "loss": 0.1434, "num_tokens": 486010275.0, "reward": 1.91064453125, "reward_std": 0.49442970752716064, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.87890625, "rewards/format_reward/std": 0.3265552520751953, "rewards/tag_count_reward/mean": 0.93017578125, "rewards/tag_count_reward/std": 0.20061688125133514, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.064453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1440.0, "completions/mean_length": 482.513671875, "completions/mean_terminated_length": 374.66180419921875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.15349068492650986, "epoch": 0.41481734380334584, "frac_reward_zero_std": 0.09375, "grad_norm": 3.4035765713990265, "learning_rate": 7.550599240342348e-07, "loss": 0.328, "num_tokens": 486329194.0, "reward": 1.78173828125, "reward_std": 0.6423017382621765, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.80078125, "rewards/format_reward/std": 0.39980348944664, "rewards/tag_count_reward/mean": 0.87353515625, "rewards/tag_count_reward/std": 0.25363850593566895, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 509.5234375, "completions/mean_terminated_length": 357.6566467285156, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.17395415157079697, "epoch": 0.41515875725503587, "frac_reward_zero_std": 0.03125, "grad_norm": 7.237387984685236, "learning_rate": 7.545823867680172e-07, "loss": 0.4286, "num_tokens": 486668598.0, "reward": 1.6328125, "reward_std": 0.6652758121490479, "rewards/accuracy_reward/mean": 0.07459677755832672, "rewards/accuracy_reward/std": 0.263004869222641, "rewards/format_reward/mean": 0.7265625, "rewards/format_reward/std": 0.4461594223976135, "rewards/tag_count_reward/mean": 0.833984375, "rewards/tag_count_reward/std": 0.27542415261268616, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 444.833984375, "completions/mean_terminated_length": 286.58154296875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 0.20696628093719482, "epoch": 0.41550017070672585, "frac_reward_zero_std": 0.0, "grad_norm": 5.817094188440207, "learning_rate": 7.54104558914169e-07, "loss": 0.3516, "num_tokens": 486970385.0, "reward": 1.64208984375, "reward_std": 0.68937087059021, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.71484375, "rewards/format_reward/std": 0.45193037390708923, "rewards/tag_count_reward/mean": 0.84130859375, "rewards/tag_count_reward/std": 0.2728548049926758, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1939.0, "completions/mean_length": 514.185546875, "completions/mean_terminated_length": 295.0692138671875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.2108100913465023, "epoch": 0.4158415841584158, "frac_reward_zero_std": 0.03125, "grad_norm": 10.680983186254593, "learning_rate": 7.536264411513948e-07, "loss": 0.4874, "num_tokens": 487318720.0, "reward": 1.55419921875, "reward_std": 0.7269125580787659, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.671875, "rewards/format_reward/std": 0.4699897766113281, "rewards/tag_count_reward/mean": 0.81591796875, "rewards/tag_count_reward/std": 0.28312888741493225, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.255859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 716.859375, "completions/mean_terminated_length": 259.17059326171875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.21702582016587257, "epoch": 0.41618299761010585, "frac_reward_zero_std": 0.03125, "grad_norm": 7.7874579973706295, "learning_rate": 7.531480341588101e-07, "loss": 0.5266, "num_tokens": 487771640.0, "reward": 1.2001953125, "reward_std": 0.7140411138534546, "rewards/accuracy_reward/mean": 0.032258063554763794, "rewards/accuracy_reward/std": 0.17686307430267334, "rewards/format_reward/mean": 0.45703125, "rewards/format_reward/std": 0.49863746762275696, "rewards/tag_count_reward/mean": 0.7119140625, "rewards/tag_count_reward/std": 0.3108674883842468, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.482421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 1103.5859375, "completions/mean_terminated_length": 223.3207550048828, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2838726341724396, "epoch": 0.41652441106179583, "frac_reward_zero_std": 0.0, "grad_norm": 6.502960410305476, "learning_rate": 7.526693386159411e-07, "loss": 0.4289, "num_tokens": 488415332.0, "reward": 0.89794921875, "reward_std": 0.6713407635688782, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12414088100194931, "rewards/format_reward/mean": 0.283203125, "rewards/format_reward/std": 0.4509948492050171, "rewards/tag_count_reward/mean": 0.59912109375, "rewards/tag_count_reward/std": 0.300951212644577, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.62109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 1371.078125, "completions/mean_terminated_length": 261.4845275878906, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 0.320064939558506, "epoch": 0.4168658245134858, "frac_reward_zero_std": 0.0, "grad_norm": 5.399548479782252, "learning_rate": 7.521903552027246e-07, "loss": 0.3568, "num_tokens": 489194364.0, "reward": 0.6650390625, "reward_std": 0.5664805173873901, "rewards/accuracy_reward/mean": 0.005859375, "rewards/accuracy_reward/std": 0.07639661431312561, "rewards/format_reward/mean": 0.16015625, "rewards/format_reward/std": 0.3671095669269562, "rewards/tag_count_reward/mean": 0.4990234375, "rewards/tag_count_reward/std": 0.26264315843582153, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.708984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1497.5546875, "completions/mean_terminated_length": 156.5369110107422, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5256960690021515, "epoch": 0.41720723796517584, "frac_reward_zero_std": 0.0, "grad_norm": 8.000388485631124, "learning_rate": 7.517110845995055e-07, "loss": 0.2561, "num_tokens": 490035896.0, "reward": 0.54541015625, "reward_std": 0.42373234033584595, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12414088100194931, "rewards/format_reward/mean": 0.083984375, "rewards/format_reward/std": 0.2776356339454651, "rewards/tag_count_reward/mean": 0.44580078125, "rewards/tag_count_reward/std": 0.23064391314983368, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.70703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1517.59375, "completions/mean_terminated_length": 237.5466766357422, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.8072696030139923, "epoch": 0.4175486514168658, "frac_reward_zero_std": 0.0, "grad_norm": 8.35471822166631, "learning_rate": 7.512315274870371e-07, "loss": 0.2296, "num_tokens": 490892584.0, "reward": 0.48974609375, "reward_std": 0.34799057245254517, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.068359375, "rewards/format_reward/std": 0.25260838866233826, "rewards/tag_count_reward/mean": 0.42138671875, "rewards/tag_count_reward/std": 0.22129009664058685, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 1691.26953125, "completions/mean_terminated_length": 417.2321472167969, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 1.4352602362632751, "epoch": 0.41789006486855584, "frac_reward_zero_std": 0.0, "grad_norm": 6.12852007694843, "learning_rate": 7.507516845464797e-07, "loss": 0.1016, "num_tokens": 491831314.0, "reward": 0.39208984375, "reward_std": 0.2066935896873474, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.017578125, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.37451171875, "rewards/tag_count_reward/std": 0.1741621196269989, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.642578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1571.72265625, "completions/mean_terminated_length": 715.4644775390625, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 2.0853580236434937, "epoch": 0.4182314783202458, "frac_reward_zero_std": 0.0, "grad_norm": 11.985156963680293, "learning_rate": 7.502715564593991e-07, "loss": 0.1288, "num_tokens": 492711076.0, "reward": 0.44384765625, "reward_std": 0.27972593903541565, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.048828125, "rewards/format_reward/std": 0.2157193273305893, "rewards/tag_count_reward/mean": 0.39501953125, "rewards/tag_count_reward/std": 0.20473743975162506, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.634765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1623.216796875, "completions/mean_terminated_length": 884.9572143554688, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 3.169598877429962, "epoch": 0.4185728917719358, "frac_reward_zero_std": 0.0, "grad_norm": 7.697677254499665, "learning_rate": 7.497911439077665e-07, "loss": 0.0729, "num_tokens": 493620499.0, "reward": 0.36474609375, "reward_std": 0.20729228854179382, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.01953125, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.34521484375, "rewards/tag_count_reward/std": 0.16130447387695312, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.548828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1568.513671875, "completions/mean_terminated_length": 985.242431640625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 3.7875962257385254, "epoch": 0.4189143052236258, "frac_reward_zero_std": 0.03125, "grad_norm": 7.688892133024398, "learning_rate": 7.493104475739574e-07, "loss": 0.0943, "num_tokens": 494500938.0, "reward": 0.33203125, "reward_std": 0.1577494740486145, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.009765625, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.322265625, "rewards/tag_count_reward/std": 0.15687862038612366, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.61328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1587.58984375, "completions/mean_terminated_length": 857.4444580078125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.698782205581665, "epoch": 0.4192557186753158, "frac_reward_zero_std": 0.0625, "grad_norm": 16.25644199770594, "learning_rate": 7.488294681407498e-07, "loss": 0.079, "num_tokens": 495395992.0, "reward": 0.35986328125, "reward_std": 0.1925991028547287, "rewards/accuracy_reward/mean": 0.021484375, "rewards/accuracy_reward/std": 0.14513419568538666, "rewards/format_reward/mean": 0.005859375, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.33251953125, "rewards/tag_count_reward/std": 0.1667054444551468, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.69140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1922.0, "completions/mean_length": 1597.3359375, "completions/mean_terminated_length": 587.6202392578125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 3.25045508146286, "epoch": 0.4195971321270058, "frac_reward_zero_std": 0.0, "grad_norm": 36.11402682941318, "learning_rate": 7.48348206291324e-07, "loss": 0.1059, "num_tokens": 496293364.0, "reward": 0.34423828125, "reward_std": 0.20534592866897583, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.10772226005792618, "rewards/format_reward/mean": 0.009765625, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.32275390625, "rewards/tag_count_reward/std": 0.18092724680900574, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1520.759765625, "completions/mean_terminated_length": 441.1726379394531, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 1.3024340867996216, "epoch": 0.4199385455786958, "frac_reward_zero_std": 0.0, "grad_norm": 33.29398049183131, "learning_rate": 7.478666627092618e-07, "loss": 0.1685, "num_tokens": 497153097.0, "reward": 0.3623046875, "reward_std": 0.27567747235298157, "rewards/accuracy_reward/mean": 0.021484375, "rewards/accuracy_reward/std": 0.14513419568538666, "rewards/format_reward/mean": 0.01171875, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.3291015625, "rewards/tag_count_reward/std": 0.21408973634243011, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.60546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1917.0, "completions/mean_length": 1443.33203125, "completions/mean_terminated_length": 515.376220703125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.5434005931019783, "epoch": 0.4202799590303858, "frac_reward_zero_std": 0.0, "grad_norm": 10.677645289724607, "learning_rate": 7.473848380785448e-07, "loss": 0.1934, "num_tokens": 497968115.0, "reward": 0.32958984375, "reward_std": 0.28829145431518555, "rewards/accuracy_reward/mean": 0.009765625, "rewards/accuracy_reward/std": 0.09843364357948303, "rewards/format_reward/mean": 0.013671875, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.30615234375, "rewards/tag_count_reward/std": 0.2470891922712326, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.560546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1384.42578125, "completions/mean_terminated_length": 538.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 0.3176286146044731, "epoch": 0.4206213724820758, "frac_reward_zero_std": 0.0, "grad_norm": 5.189888382007126, "learning_rate": 7.469027330835536e-07, "loss": 0.1952, "num_tokens": 498767165.0, "reward": 0.2744140625, "reward_std": 0.3157193958759308, "rewards/accuracy_reward/mean": 0.012096773833036423, "rewards/accuracy_reward/std": 0.10942844301462173, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.2470703125, "rewards/tag_count_reward/std": 0.2516894042491913, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.568359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 1459.171875, "completions/mean_terminated_length": 683.837158203125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.24813542142510414, "epoch": 0.4209627859337658, "frac_reward_zero_std": 0.0, "grad_norm": 4.672946936424408, "learning_rate": 7.464203484090679e-07, "loss": 0.1649, "num_tokens": 499590645.0, "reward": 0.20458984375, "reward_std": 0.27056124806404114, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08812850713729858, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.18896484375, "rewards/tag_count_reward/std": 0.24292393028736115, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.623046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1526.65234375, "completions/mean_terminated_length": 664.9429931640625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.2962442934513092, "epoch": 0.42130419938545577, "frac_reward_zero_std": 0.0, "grad_norm": 9.72538293346304, "learning_rate": 7.459376847402637e-07, "loss": 0.1501, "num_tokens": 500445923.0, "reward": 0.232421875, "reward_std": 0.26603639125823975, "rewards/accuracy_reward/mean": 0.017578125, "rewards/accuracy_reward/std": 0.13154059648513794, "rewards/format_reward/mean": 0.005859375, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.208984375, "rewards/tag_count_reward/std": 0.23673690855503082, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.634765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 1550.375, "completions/mean_terminated_length": 685.5187377929688, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.34685809165239334, "epoch": 0.4216456128371458, "frac_reward_zero_std": 0.0, "grad_norm": 11.604601051273672, "learning_rate": 7.454547427627136e-07, "loss": 0.1417, "num_tokens": 501323923.0, "reward": 0.2548828125, "reward_std": 0.29777583479881287, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.1385180652141571, "rewards/format_reward/mean": 0.021484375, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.2138671875, "rewards/tag_count_reward/std": 0.24262726306915283, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.646484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1604.640625, "completions/mean_terminated_length": 793.8563842773438, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.3399280682206154, "epoch": 0.4219870262888358, "frac_reward_zero_std": 0.0, "grad_norm": 6.128102257034383, "learning_rate": 7.449715231623857e-07, "loss": 0.119, "num_tokens": 502239387.0, "reward": 0.19287109375, "reward_std": 0.24031245708465576, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12414088100194931, "rewards/format_reward/mean": 0.005859375, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.17138671875, "rewards/tag_count_reward/std": 0.20343582332134247, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.736328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1710.23046875, "completions/mean_terminated_length": 766.977783203125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.37471941113471985, "epoch": 0.42232843974052575, "frac_reward_zero_std": 0.0, "grad_norm": 8.24076425034655, "learning_rate": 7.444880266256425e-07, "loss": 0.1011, "num_tokens": 503192001.0, "reward": 0.19091796875, "reward_std": 0.22519509494304657, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.10772226005792618, "rewards/format_reward/mean": 0.005859375, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.17333984375, "rewards/tag_count_reward/std": 0.2029796987771988, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1717.953125, "completions/mean_terminated_length": 805.4705810546875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 0.46489106863737106, "epoch": 0.4226698531922158, "frac_reward_zero_std": 0.0, "grad_norm": 19.41469268572275, "learning_rate": 7.440042538392393e-07, "loss": 0.1102, "num_tokens": 504150169.0, "reward": 0.1708984375, "reward_std": 0.23276157677173615, "rewards/accuracy_reward/mean": 0.021484375, "rewards/accuracy_reward/std": 0.14513419568538666, "rewards/format_reward/mean": 0.001953125, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.1474609375, "rewards/tag_count_reward/std": 0.19717524945735931, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1793.5390625, "completions/mean_terminated_length": 924.862060546875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.5292958617210388, "epoch": 0.42301126664390576, "frac_reward_zero_std": 0.0, "grad_norm": 27.645299628385228, "learning_rate": 7.435202054903244e-07, "loss": 0.0665, "num_tokens": 505144589.0, "reward": 0.11376953125, "reward_std": 0.18638277053833008, "rewards/accuracy_reward/mean": 0.02016128972172737, "rewards/accuracy_reward/std": 0.14069372415542603, "rewards/format_reward/mean": 0.001953125, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.09228515625, "rewards/tag_count_reward/std": 0.1430177092552185, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.775390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1785.45703125, "completions/mean_terminated_length": 879.1129760742188, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.6626039743423462, "epoch": 0.4233526800955958, "frac_reward_zero_std": 0.0, "grad_norm": 23.174089517954055, "learning_rate": 7.430358822664371e-07, "loss": 0.0838, "num_tokens": 506138551.0, "reward": 0.0888671875, "reward_std": 0.14223650097846985, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08812850713729858, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0810546875, "rewards/tag_count_reward/std": 0.12521004676818848, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.748046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1773.05859375, "completions/mean_terminated_length": 956.7597045898438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 0.8541796207427979, "epoch": 0.42369409354728577, "frac_reward_zero_std": 0.0, "grad_norm": 40.83013073679714, "learning_rate": 7.425512848555073e-07, "loss": 0.034, "num_tokens": 507122789.0, "reward": 0.06689453125, "reward_std": 0.12366434186697006, "rewards/accuracy_reward/mean": 0.005859375, "rewards/accuracy_reward/std": 0.07639661431312561, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.06103515625, "rewards/tag_count_reward/std": 0.11624550074338913, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1811.443359375, "completions/mean_terminated_length": 836.8299560546875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 1.137829065322876, "epoch": 0.42403550699897574, "frac_reward_zero_std": 0.03125, "grad_norm": 40.732853074833145, "learning_rate": 7.420664139458546e-07, "loss": 0.0026, "num_tokens": 508137544.0, "reward": 0.072265625, "reward_std": 0.11585834622383118, "rewards/accuracy_reward/mean": 0.012096773833036423, "rewards/accuracy_reward/std": 0.10942844301462173, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.060546875, "rewards/tag_count_reward/std": 0.10946451127529144, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.880859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1899.83203125, "completions/mean_terminated_length": 804.360595703125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 2.0902875661849976, "epoch": 0.4243769204506658, "frac_reward_zero_std": 0.125, "grad_norm": 56.32534037999387, "learning_rate": 7.415812702261864e-07, "loss": 0.0322, "num_tokens": 509187026.0, "reward": 0.041015625, "reward_std": 0.08995534479618073, "rewards/accuracy_reward/mean": 0.001953125, "rewards/accuracy_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0390625, "rewards/tag_count_reward/std": 0.09219809621572495, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.935546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 1954.392578125, "completions/mean_terminated_length": 595.6666870117188, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 3.2074812054634094, "epoch": 0.42471833390235575, "frac_reward_zero_std": 0.28125, "grad_norm": 57.212069984309295, "learning_rate": 7.410958543855983e-07, "loss": 0.0192, "num_tokens": 510266763.0, "reward": 0.0234375, "reward_std": 0.06813475489616394, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.06243881583213806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01953125, "rewards/tag_count_reward/std": 0.06715766340494156, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1896.0, "completions/mean_length": 1984.77734375, "completions/mean_terminated_length": 429.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 4.142726898193359, "epoch": 0.4250597473540457, "frac_reward_zero_std": 0.8125, "grad_norm": 16.268504198356908, "learning_rate": 7.406101671135721e-07, "loss": 0.0004, "num_tokens": 511359785.0, "reward": 0.00341796875, "reward_std": 0.012434101663529873, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00341796875, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.966796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1248.0, "completions/mean_length": 1989.546875, "completions/mean_terminated_length": 287.5294189453125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 4.842086672782898, "epoch": 0.42540116080573576, "frac_reward_zero_std": 0.9375, "grad_norm": 6.477958993392879, "learning_rate": 7.40124209099975e-07, "loss": -0.0006, "num_tokens": 512466449.0, "reward": 0.0009765625, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1962.0546875, "completions/mean_terminated_length": 947.9000244140625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 4.519717335700989, "epoch": 0.42574257425742573, "frac_reward_zero_std": 0.96875, "grad_norm": 0.03199797078490016, "learning_rate": 7.396379810350591e-07, "loss": 0.0012, "num_tokens": 513550877.0, "reward": 0.0107421875, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0107421875, "rewards/tag_count_reward/std": 0.06358373910188675, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.978515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 2005.96875, "completions/mean_terminated_length": 91.63636779785156, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 4.3883514404296875, "epoch": 0.42608398770911576, "frac_reward_zero_std": 0.96875, "grad_norm": 5.136322063864358, "learning_rate": 7.3915148360946e-07, "loss": -0.0003, "num_tokens": 514660301.0, "reward": 0.00048828125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00048828125, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.978515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 2006.48046875, "completions/mean_terminated_length": 115.45455169677734, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 4.42693305015564, "epoch": 0.42642540116080574, "frac_reward_zero_std": 0.96875, "grad_norm": 7.86930647895727, "learning_rate": 7.386647175141955e-07, "loss": 0.0069, "num_tokens": 515759235.0, "reward": 0.001953125, "reward_std": 0.0078125, "rewards/accuracy_reward/mean": 0.001953125, "rewards/accuracy_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1403.0, "completions/mean_length": 2038.826171875, "completions/mean_terminated_length": 482.3333435058594, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 5.468417644500732, "epoch": 0.4267668146124957, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.381776834406656e-07, "loss": 0.0, "num_tokens": 516875498.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 2030.939453125, "completions/mean_terminated_length": 301.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 6.1761943101882935, "epoch": 0.42710822806418575, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.376903820806507e-07, "loss": 0.0, "num_tokens": 517987003.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 2037.400390625, "completions/mean_terminated_length": 239.0, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 6.578524708747864, "epoch": 0.4274496415158757, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.372028141263105e-07, "loss": 0.0, "num_tokens": 519101784.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 1968.498046875, "completions/mean_terminated_length": 109.66667175292969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.728260397911072, "epoch": 0.4277910549675657, "frac_reward_zero_std": 0.96875, "grad_norm": 0.04740129257045049, "learning_rate": 7.36714980270184e-07, "loss": -0.0008, "num_tokens": 520191239.0, "reward": 0.06005859375, "reward_std": 0.037382494658231735, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12414088100194931, "rewards/format_reward/mean": 0.01953125, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.02490234375, "rewards/tag_count_reward/std": 0.14500823616981506, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 2036.431640625, "completions/mean_terminated_length": 863.4000244140625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.911633491516113, "epoch": 0.42813246841925573, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.362268812051872e-07, "loss": 0.0, "num_tokens": 521312916.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1376.0, "completions/mean_length": 2031.548828125, "completions/mean_terminated_length": 363.3999938964844, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 6.8768274784088135, "epoch": 0.4284738818709457, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.357385176246135e-07, "loss": 0.0, "num_tokens": 522431277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 2043.734375, "completions/mean_terminated_length": 956.0, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "entropy": 7.017455339431763, "epoch": 0.42881529532263574, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.352498902221315e-07, "loss": 0.0, "num_tokens": 523551861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 2040.04296875, "completions/mean_terminated_length": 11.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 6.98773729801178, "epoch": 0.4291567087743257, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.347609996917849e-07, "loss": 0.0, "num_tokens": 524670683.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 2040.69921875, "completions/mean_terminated_length": 179.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 6.916937589645386, "epoch": 0.4294981222260157, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.342718467279908e-07, "loss": 0.0, "num_tokens": 525791713.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 2041.66796875, "completions/mean_terminated_length": 427.0, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 6.919025301933289, "epoch": 0.4298395356777057, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.337824320255394e-07, "loss": 0.0, "num_tokens": 526913111.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1434.0, "completions/mean_length": 2027.939453125, "completions/mean_terminated_length": 580.7142944335938, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 6.790865898132324, "epoch": 0.4301809491293957, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.33292756279592e-07, "loss": 0.0, "num_tokens": 528032840.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 2044.775390625, "completions/mean_terminated_length": 397.0, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "entropy": 6.811714053153992, "epoch": 0.43052236258108567, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.32802820185682e-07, "loss": 0.0, "num_tokens": 529154213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 2040.62109375, "completions/mean_terminated_length": 788.6666870117188, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "entropy": 6.827486991882324, "epoch": 0.4308637760327757, "frac_reward_zero_std": 0.96875, "grad_norm": 1.670829486911588, "learning_rate": 7.32312624439711e-07, "loss": 0.0001, "num_tokens": 530276739.0, "reward": 0.00048828125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00048828125, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 7.09375, "epoch": 0.4312051894844657, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.318221697379505e-07, "loss": 0.0, "num_tokens": 531398259.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1231.0, "completions/mean_length": 2044.36328125, "completions/mean_terminated_length": 1117.0, "completions/min_length": 1003.0, "completions/min_terminated_length": 1003.0, "entropy": 7.405757188796997, "epoch": 0.4315466029361557, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.313314567770397e-07, "loss": 0.0, "num_tokens": 532531597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 7.7587890625, "epoch": 0.4318880163878457, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.308404862539841e-07, "loss": 0.0, "num_tokens": 533660205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 2041.4453125, "completions/mean_terminated_length": 370.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.945307016372681, "epoch": 0.43222942983953566, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.303492588661555e-07, "loss": 0.0, "num_tokens": 534787089.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 7.9111328125, "epoch": 0.4325708432912257, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.298577753112905e-07, "loss": 0.0, "num_tokens": 535921569.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 7.779296875, "epoch": 0.43291225674291567, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.293660362874892e-07, "loss": 0.0, "num_tokens": 537048177.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 7.716796875, "epoch": 0.43325367019460564, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.288740424932151e-07, "loss": 0.0, "num_tokens": 538169457.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 7.5654296875, "epoch": 0.4335950836462957, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.283817946272933e-07, "loss": 0.0, "num_tokens": 539289489.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 2044.03515625, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 7.45487642288208, "epoch": 0.43393649709798565, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.278892933889098e-07, "loss": 0.0, "num_tokens": 540413299.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 2040.486328125, "completions/mean_terminated_length": 124.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 7.323737382888794, "epoch": 0.4342779105496757, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.273965394776105e-07, "loss": 0.0, "num_tokens": 541536860.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 2040.671875, "completions/mean_terminated_length": 172.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.248687505722046, "epoch": 0.43461932400136566, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.269035335933e-07, "loss": 0.0, "num_tokens": 542653588.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 2045.2109375, "completions/mean_terminated_length": 620.0, "completions/min_length": 620.0, "completions/min_terminated_length": 620.0, "entropy": 7.176558971405029, "epoch": 0.43496073745305563, "frac_reward_zero_std": 0.96875, "grad_norm": 1.6921661316243855, "learning_rate": 7.264102764362412e-07, "loss": 0.0, "num_tokens": 543772544.0, "reward": 0.00048828125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00048828125, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 6.3837890625, "epoch": 0.43530215090474567, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.259167687070534e-07, "loss": 0.0, "num_tokens": 544889056.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 2044.390625, "completions/mean_terminated_length": 200.0, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 6.058590888977051, "epoch": 0.43564356435643564, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.254230111067126e-07, "loss": 0.0, "num_tokens": 546011816.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 2044.029296875, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 5.823675036430359, "epoch": 0.4359849778081256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.249290043365487e-07, "loss": 0.0, "num_tokens": 547127799.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 2033.181640625, "completions/mean_terminated_length": 151.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 5.515985369682312, "epoch": 0.43632639125981565, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.244347490982463e-07, "loss": 0.0, "num_tokens": 548246564.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 5.2490234375, "epoch": 0.4366678047115056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.239402460938427e-07, "loss": 0.0, "num_tokens": 549374708.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 2040.10546875, "completions/mean_terminated_length": 27.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 4.943152666091919, "epoch": 0.43700921816319566, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.234454960257271e-07, "loss": 0.0, "num_tokens": 550489978.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.966796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2044.01171875, "completions/mean_terminated_length": 1927.88232421875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 4.740756988525391, "epoch": 0.43735063161488563, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.229504995966393e-07, "loss": 0.0, "num_tokens": 551624048.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 2017.15234375, "completions/mean_terminated_length": 73.75, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 4.721948862075806, "epoch": 0.4376920450665756, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.224552575096699e-07, "loss": 0.0, "num_tokens": 552730846.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 2040.130859375, "completions/mean_terminated_length": 33.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 4.449633240699768, "epoch": 0.43803345851826564, "frac_reward_zero_std": 0.96875, "grad_norm": 4.993113491777296, "learning_rate": 7.219597704682572e-07, "loss": 0.0, "num_tokens": 553850161.0, "reward": 0.00048828125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00048828125, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2024.482421875, "completions/mean_terminated_length": 1500.681884765625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 3.530362606048584, "epoch": 0.4383748719699556, "frac_reward_zero_std": 0.96875, "grad_norm": 11.629805912424718, "learning_rate": 7.214640391761887e-07, "loss": 0.0, "num_tokens": 554977080.0, "reward": 0.00048828125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00048828125, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.97265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1616.0, "completions/mean_length": 2000.888671875, "completions/mean_terminated_length": 325.0714416503906, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 3.721114456653595, "epoch": 0.4387162854216456, "frac_reward_zero_std": 0.9375, "grad_norm": 30.051401607495187, "learning_rate": 7.209680643375978e-07, "loss": 0.0, "num_tokens": 556073215.0, "reward": 0.00146484375, "reward_std": 0.004621601663529873, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00146484375, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 2032.181640625, "completions/mean_terminated_length": 23.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 4.846101403236389, "epoch": 0.4390576988733356, "frac_reward_zero_std": 0.9375, "grad_norm": 16.437597534643583, "learning_rate": 7.204718466569645e-07, "loss": 0.0075, "num_tokens": 557192908.0, "reward": 0.0009765625, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 2036.076171875, "completions/mean_terminated_length": 13.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 4.978646874427795, "epoch": 0.4393991123250256, "frac_reward_zero_std": 0.96875, "grad_norm": 1.9364025191550587, "learning_rate": 7.199753868391138e-07, "loss": -0.0005, "num_tokens": 558311379.0, "reward": 0.00048828125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00048828125, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 2040.755859375, "completions/mean_terminated_length": 193.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 5.352870464324951, "epoch": 0.43974052577671563, "frac_reward_zero_std": 0.96875, "grad_norm": 1.9119318209945644, "learning_rate": 7.194786855892135e-07, "loss": 0.006, "num_tokens": 559435526.0, "reward": 0.00048828125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00048828125, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 5.55078125, "epoch": 0.4400819392284056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.18981743612776e-07, "loss": 0.0, "num_tokens": 560559958.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 5.7265625, "epoch": 0.4404233526800956, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.184845616156543e-07, "loss": 0.0, "num_tokens": 561684310.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2036.63671875, "completions/mean_terminated_length": 1684.375, "completions/min_length": 1142.0, "completions/min_terminated_length": 1142.0, "entropy": 5.565494418144226, "epoch": 0.4407647661317856, "frac_reward_zero_std": 0.9375, "grad_norm": 4.7047024303725005, "learning_rate": 7.17987140304043e-07, "loss": 0.0056, "num_tokens": 562813468.0, "reward": 0.0126953125, "reward_std": 0.006702834274619818, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0126953125, "rewards/tag_count_reward/std": 0.07053722441196442, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 5.2958984375, "epoch": 0.4411061795834756, "frac_reward_zero_std": 0.9375, "grad_norm": 15.893593331430958, "learning_rate": 7.174894803844765e-07, "loss": 0.0, "num_tokens": 563939324.0, "reward": 0.00439453125, "reward_std": 0.005987482611089945, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00439453125, "rewards/tag_count_reward/std": 0.032885149121284485, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1861.0, "completions/mean_length": 2044.1328125, "completions/mean_terminated_length": 1553.0, "completions/min_length": 928.0, "completions/min_terminated_length": 928.0, "entropy": 3.9458271861076355, "epoch": 0.44144759303516556, "frac_reward_zero_std": 0.625, "grad_norm": 63.363881004661955, "learning_rate": 7.169915825638277e-07, "loss": 0.0059, "num_tokens": 565062672.0, "reward": 0.02490234375, "reward_std": 0.04053648188710213, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.06243881583213806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.02099609375, "rewards/tag_count_reward/std": 0.06940890848636627, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.810546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1856.783203125, "completions/mean_terminated_length": 1038.690673828125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 2.180778682231903, "epoch": 0.4417890064868556, "frac_reward_zero_std": 0.15625, "grad_norm": 54.23198896872311, "learning_rate": 7.164934475493081e-07, "loss": 0.0326, "num_tokens": 566088081.0, "reward": 0.2490234375, "reward_std": 0.08323083817958832, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2490234375, "rewards/tag_count_reward/std": 0.1013558954000473, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.771484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1835.98046875, "completions/mean_terminated_length": 1120.1881103515625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 2.5482189655303955, "epoch": 0.44213041993854557, "frac_reward_zero_std": 0.375, "grad_norm": 58.873842784481454, "learning_rate": 7.159950760484658e-07, "loss": 0.0017, "num_tokens": 567104519.0, "reward": 0.2568359375, "reward_std": 0.06079227477312088, "rewards/accuracy_reward/mean": 0.001953125, "rewards/accuracy_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2548828125, "rewards/tag_count_reward/std": 0.08112218230962753, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.861328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 1879.19921875, "completions/mean_terminated_length": 830.7323608398438, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 2.109836459159851, "epoch": 0.4424718333902356, "frac_reward_zero_std": 0.15625, "grad_norm": 47.75298412414839, "learning_rate": 7.154964687691844e-07, "loss": 0.0211, "num_tokens": 568144765.0, "reward": 0.32177734375, "reward_std": 0.10963590443134308, "rewards/accuracy_reward/mean": 0.025390625, "rewards/accuracy_reward/std": 0.15746226906776428, "rewards/format_reward/mean": 0.001953125, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.29443359375, "rewards/tag_count_reward/std": 0.11642619967460632, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.908203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1941.31640625, "completions/mean_terminated_length": 885.8297729492188, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 2.9485703110694885, "epoch": 0.4428132468419256, "frac_reward_zero_std": 0.375, "grad_norm": 93.16461079145716, "learning_rate": 7.149976264196833e-07, "loss": 0.0023, "num_tokens": 569214447.0, "reward": 0.27294921875, "reward_std": 0.05831799656152725, "rewards/accuracy_reward/mean": 0.001953125, "rewards/accuracy_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.27099609375, "rewards/tag_count_reward/std": 0.08080795407295227, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.91796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 1936.369140625, "completions/mean_terminated_length": 687.1666870117188, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 3.6260375380516052, "epoch": 0.44315466029361555, "frac_reward_zero_std": 0.53125, "grad_norm": 48.409534199537205, "learning_rate": 7.144985497085148e-07, "loss": 0.0047, "num_tokens": 570283388.0, "reward": 0.25537109375, "reward_std": 0.03992920368909836, "rewards/accuracy_reward/mean": 0.001953125, "rewards/accuracy_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25341796875, "rewards/tag_count_reward/std": 0.050564687699079514, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.876953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 1886.23046875, "completions/mean_terminated_length": 733.3016357421875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 3.797580659389496, "epoch": 0.4434960737453056, "frac_reward_zero_std": 0.5, "grad_norm": 25.58824158154414, "learning_rate": 7.13999239344565e-07, "loss": 0.0057, "num_tokens": 571321522.0, "reward": 0.2529296875, "reward_std": 0.05417773872613907, "rewards/accuracy_reward/mean": 0.008064515888690948, "rewards/accuracy_reward/std": 0.0895301103591919, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2451171875, "rewards/tag_count_reward/std": 0.056179627776145935, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1902.447265625, "completions/mean_terminated_length": 717.232177734375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.095020413398743, "epoch": 0.44383748719699556, "frac_reward_zero_std": 0.375, "grad_norm": 170.33867026552983, "learning_rate": 7.13499696037051e-07, "loss": -0.0071, "num_tokens": 572374439.0, "reward": 0.21923828125, "reward_std": 0.07764068245887756, "rewards/accuracy_reward/mean": 0.005859375, "rewards/accuracy_reward/std": 0.07639661431312561, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.21337890625, "rewards/tag_count_reward/std": 0.09253818541765213, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.939453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 1962.162109375, "completions/mean_terminated_length": 630.290283203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.6874771118164062, "epoch": 0.44417890064868554, "frac_reward_zero_std": 0.0625, "grad_norm": 131.97478457487202, "learning_rate": 7.129999204955214e-07, "loss": 0.0074, "num_tokens": 573465386.0, "reward": 0.04248046875, "reward_std": 0.09234219789505005, "rewards/accuracy_reward/mean": 0.001953125, "rewards/accuracy_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.04052734375, "rewards/tag_count_reward/std": 0.09354464709758759, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 2037.125, "completions/mean_terminated_length": 1120.0, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 4.926165342330933, "epoch": 0.44452031410037557, "frac_reward_zero_std": 0.90625, "grad_norm": 6.875214143506856, "learning_rate": 7.124999134298544e-07, "loss": 0.0, "num_tokens": 574590090.0, "reward": 0.00146484375, "reward_std": 0.005859375, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00146484375, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 2043.39453125, "completions/mean_terminated_length": 869.0, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "entropy": 5.623869180679321, "epoch": 0.44486172755206554, "frac_reward_zero_std": 0.96875, "grad_norm": 5.561051140032619, "learning_rate": 7.119996755502572e-07, "loss": -0.0, "num_tokens": 575712052.0, "reward": 0.00048828125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00048828125, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 2041.87890625, "completions/mean_terminated_length": 481.0, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 5.609886288642883, "epoch": 0.4452031410037555, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.114992075672648e-07, "loss": 0.0, "num_tokens": 576833366.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 2043.2890625, "completions/mean_terminated_length": 1244.0, "completions/min_length": 984.0, "completions/min_terminated_length": 984.0, "entropy": 5.588392853736877, "epoch": 0.44554455445544555, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.10998510191739e-07, "loss": 0.0, "num_tokens": 577952682.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1099.0, "completions/mean_length": 2042.158203125, "completions/mean_terminated_length": 552.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 5.467584609985352, "epoch": 0.4458859679071355, "frac_reward_zero_std": 0.96875, "grad_norm": 2.597155245795108, "learning_rate": 7.104975841348673e-07, "loss": 0.0001, "num_tokens": 579073307.0, "reward": 0.00048828125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00048828125, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 2038.408203125, "completions/mean_terminated_length": 411.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 5.534635663032532, "epoch": 0.44622738135882556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.099964301081621e-07, "loss": 0.0, "num_tokens": 580197676.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1376.0, "completions/mean_length": 2018.921875, "completions/mean_terminated_length": 559.2000122070312, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 5.529334664344788, "epoch": 0.44656879481051553, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.0949504882346e-07, "loss": 0.0, "num_tokens": 581306996.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 2019.455078125, "completions/mean_terminated_length": 830.0833740234375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 5.47739839553833, "epoch": 0.4469102082622055, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.089934409929198e-07, "loss": 0.0, "num_tokens": 582415693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.939453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2005.75, "completions/mean_terminated_length": 1350.1934814453125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.111805200576782, "epoch": 0.44725162171389554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.084916073290223e-07, "loss": 0.0, "num_tokens": 583531597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.966796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1946.0, "completions/mean_length": 2012.5078125, "completions/mean_terminated_length": 979.058837890625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 5.258828401565552, "epoch": 0.4475930351655855, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.079895485445694e-07, "loss": 0.0, "num_tokens": 584648017.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.94921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1617.0, "completions/mean_length": 1974.8125, "completions/mean_terminated_length": 606.7692260742188, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 4.961986303329468, "epoch": 0.4479344486172755, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.07487265352682e-07, "loss": 0.0, "num_tokens": 585742225.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.927734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1873.0, "completions/mean_length": 1955.919921875, "completions/mean_terminated_length": 773.8108520507812, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 5.046663522720337, "epoch": 0.4482758620689655, "frac_reward_zero_std": 0.96875, "grad_norm": 2.9148876597729196, "learning_rate": 7.069847584668009e-07, "loss": 0.0001, "num_tokens": 586815624.0, "reward": 0.00048828125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00048828125, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.91015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1715.0, "completions/mean_length": 1926.72265625, "completions/mean_terminated_length": 698.1304321289062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 5.5220701694488525, "epoch": 0.4486172755206555, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.064820286006832e-07, "loss": 0.0, "num_tokens": 587881978.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1953.810546875, "completions/mean_terminated_length": 708.4166870117188, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 5.890812873840332, "epoch": 0.44895868897234553, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.059790764684039e-07, "loss": 0.0, "num_tokens": 588971817.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.93359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1779.0, "completions/mean_length": 1954.76171875, "completions/mean_terminated_length": 643.941162109375, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 6.269213318824768, "epoch": 0.4493001024240355, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.054759027843532e-07, "loss": 0.0, "num_tokens": 590055519.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1905.24609375, "completions/mean_terminated_length": 525.2916870117188, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 6.439913272857666, "epoch": 0.4496415158757255, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.049725082632362e-07, "loss": 0.0, "num_tokens": 591103405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.888671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1376.0, "completions/mean_length": 1882.671875, "completions/mean_terminated_length": 562.9473876953125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 6.707048654556274, "epoch": 0.4499829293274155, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.044688936200712e-07, "loss": 0.0, "num_tokens": 592140245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.923828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1771.0, "completions/mean_length": 1935.900390625, "completions/mean_terminated_length": 576.3333129882812, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 6.837719917297363, "epoch": 0.4503243427791055, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.039650595701898e-07, "loss": 0.0, "num_tokens": 593211010.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 1947.955078125, "completions/mean_terminated_length": 625.138916015625, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 6.919134855270386, "epoch": 0.45066575623079547, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.034610068292349e-07, "loss": 0.0, "num_tokens": 594276491.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.94921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 1977.548828125, "completions/mean_terminated_length": 660.6538696289062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 6.993581056594849, "epoch": 0.4510071696824855, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.029567361131601e-07, "loss": 0.0, "num_tokens": 595370468.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.94921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1160.0, "completions/mean_length": 1970.08203125, "completions/mean_terminated_length": 513.6154174804688, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.150102496147156, "epoch": 0.4513485831341755, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.024522481382284e-07, "loss": 0.0, "num_tokens": 596451662.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1751.0, "completions/mean_length": 1987.568359375, "completions/mean_terminated_length": 641.5909423828125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 7.176490545272827, "epoch": 0.4516899965858655, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.019475436210118e-07, "loss": 0.0, "num_tokens": 597555393.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 1971.8671875, "completions/mean_terminated_length": 423.8333435058594, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.222770690917969, "epoch": 0.4520314100375555, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.014426232783896e-07, "loss": 0.0, "num_tokens": 598635965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.94140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 1964.412109375, "completions/mean_terminated_length": 621.433349609375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.250318169593811, "epoch": 0.45237282348924546, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.009374878275476e-07, "loss": 0.0, "num_tokens": 599722000.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.955078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 1984.205078125, "completions/mean_terminated_length": 627.8695678710938, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.312750220298767, "epoch": 0.4527142369409355, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.004321379859774e-07, "loss": 0.0, "num_tokens": 600814633.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.955078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1981.978515625, "completions/mean_terminated_length": 578.3043823242188, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.417229890823364, "epoch": 0.45305565039262546, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.999265744714747e-07, "loss": 0.0, "num_tokens": 601907070.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.91796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1937.0, "completions/mean_length": 1937.64453125, "completions/mean_terminated_length": 702.7142944335938, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 7.323100686073303, "epoch": 0.45339706384431544, "frac_reward_zero_std": 0.96875, "grad_norm": 0.02395595504799852, "learning_rate": 6.994207980021394e-07, "loss": 0.0035, "num_tokens": 602983240.0, "reward": 0.0146484375, "reward_std": 0.002668476663529873, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0146484375, "rewards/tag_count_reward/std": 0.08293935656547546, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.955078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1161.0, "completions/mean_length": 1976.7265625, "completions/mean_terminated_length": 461.3913269042969, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.403359532356262, "epoch": 0.45373847729600547, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.989148092963732e-07, "loss": 0.0, "num_tokens": 604072460.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.970703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1828.0, "completions/mean_length": 2007.693359375, "completions/mean_terminated_length": 672.2000122070312, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.416198015213013, "epoch": 0.45407989074769545, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.984086090728795e-07, "loss": 0.0, "num_tokens": 605164511.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.974609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1055.0, "completions/mean_length": 2007.6875, "completions/mean_terminated_length": 460.3077087402344, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 7.424793720245361, "epoch": 0.4544213041993855, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.979021980506619e-07, "loss": 0.0, "num_tokens": 606274607.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 2000.90234375, "completions/mean_terminated_length": 708.3333129882812, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.451859712600708, "epoch": 0.45476271765107545, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.973955769490243e-07, "loss": 0.0, "num_tokens": 607382045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 1992.96875, "completions/mean_terminated_length": 287.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.487868428230286, "epoch": 0.45510413110276543, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.96888746487568e-07, "loss": 0.0, "num_tokens": 608485677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1601.0, "completions/mean_length": 2009.390625, "completions/mean_terminated_length": 400.66668701171875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 7.5208481550216675, "epoch": 0.45544554455445546, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.963817073861918e-07, "loss": 0.0, "num_tokens": 609589941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1168.0, "completions/mean_length": 1981.1953125, "completions/mean_terminated_length": 419.23809814453125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.513497471809387, "epoch": 0.45578695800614544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.958744603650916e-07, "loss": 0.0, "num_tokens": 610676617.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.97265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1293.0, "completions/mean_length": 2009.548828125, "completions/mean_terminated_length": 641.7857666015625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 7.517225623130798, "epoch": 0.4561283714578354, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.953670061447576e-07, "loss": 0.0, "num_tokens": 611788114.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1292.0, "completions/mean_length": 2018.634765625, "completions/mean_terminated_length": 544.5, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 7.520344018936157, "epoch": 0.45646978490952544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.948593454459752e-07, "loss": 0.0, "num_tokens": 612901031.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1204.0, "completions/mean_length": 1991.77734375, "completions/mean_terminated_length": 448.77777099609375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.5096131563186646, "epoch": 0.4568111983612154, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.943514789898224e-07, "loss": 0.0, "num_tokens": 613997541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.970703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1803.0, "completions/mean_length": 2008.2421875, "completions/mean_terminated_length": 690.933349609375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 7.540063381195068, "epoch": 0.45715261181290545, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.938434074976701e-07, "loss": 0.0, "num_tokens": 615093905.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 2013.7578125, "completions/mean_terminated_length": 587.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 7.535347580909729, "epoch": 0.4574940252645954, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.933351316911798e-07, "loss": 0.0, "num_tokens": 616207061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.970703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1772.0, "completions/mean_length": 2004.005859375, "completions/mean_terminated_length": 546.3333740234375, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.58722460269928, "epoch": 0.4578354387162854, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.928266522923035e-07, "loss": 0.0, "num_tokens": 617313480.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.966796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1091.0, "completions/mean_length": 1994.25390625, "completions/mean_terminated_length": 429.29412841796875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.592905282974243, "epoch": 0.45817685216797543, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.923179700232826e-07, "loss": 0.0, "num_tokens": 618419642.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 1997.515625, "completions/mean_terminated_length": 432.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.596330642700195, "epoch": 0.4585182656196654, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.918090856066463e-07, "loss": 0.0, "num_tokens": 619515458.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1507.0, "completions/mean_length": 2032.568359375, "completions/mean_terminated_length": 731.1666870117188, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "entropy": 7.5570149421691895, "epoch": 0.4588596790713554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.91299999765211e-07, "loss": 0.0, "num_tokens": 620633045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.974609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1248.0, "completions/mean_length": 2009.189453125, "completions/mean_terminated_length": 519.4615478515625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.567670583724976, "epoch": 0.4592010925230454, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.907907132220794e-07, "loss": 0.0, "num_tokens": 621732438.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 2010.341796875, "completions/mean_terminated_length": 441.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.566466569900513, "epoch": 0.4595425059747354, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.902812267006389e-07, "loss": 0.0, "num_tokens": 622833877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2012.6484375, "completions/mean_terminated_length": 1401.571533203125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.418787002563477, "epoch": 0.4598839194264254, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.897715409245615e-07, "loss": 0.0, "num_tokens": 623951969.0, "reward": 0.0078125, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0078125, "rewards/tag_count_reward/std": 0.043540701270103455, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1267.0, "completions/mean_length": 2018.373046875, "completions/mean_terminated_length": 362.5555725097656, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.509260416030884, "epoch": 0.4602253328781154, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.892616566178017e-07, "loss": 0.0, "num_tokens": 625059088.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1855.0, "completions/mean_length": 2016.181640625, "completions/mean_terminated_length": 690.4166870117188, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.624423623085022, "epoch": 0.4605667463298054, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.887515745045963e-07, "loss": 0.0, "num_tokens": 626172413.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.94921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1307.0, "completions/mean_length": 1975.353515625, "completions/mean_terminated_length": 617.423095703125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.508518815040588, "epoch": 0.4609081597814954, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.882412953094629e-07, "loss": 0.0, "num_tokens": 627263842.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.97265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1323.0, "completions/mean_length": 2008.03515625, "completions/mean_terminated_length": 586.4285888671875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.596044540405273, "epoch": 0.4612495732331854, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.87730819757199e-07, "loss": 0.0, "num_tokens": 628367172.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1209.0, "completions/mean_length": 1983.623046875, "completions/mean_terminated_length": 549.7727661132812, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.600500583648682, "epoch": 0.46159098668487536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.872201485728812e-07, "loss": 0.0, "num_tokens": 629460275.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.974609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1740.0, "completions/mean_length": 2006.6796875, "completions/mean_terminated_length": 420.6153869628906, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.586745738983154, "epoch": 0.4619324001365654, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.867092824818639e-07, "loss": 0.0, "num_tokens": 630574223.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.970703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1257.0, "completions/mean_length": 2003.814453125, "completions/mean_terminated_length": 539.800048828125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.616209983825684, "epoch": 0.46227381358825537, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.861982222097785e-07, "loss": 0.0, "num_tokens": 631682992.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.966796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1316.0, "completions/mean_length": 1995.30859375, "completions/mean_terminated_length": 461.058837890625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.617700695991516, "epoch": 0.4626152270399454, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.856869684825316e-07, "loss": 0.0, "num_tokens": 632778558.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.955078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1653.0, "completions/mean_length": 1980.677734375, "completions/mean_terminated_length": 549.3478393554688, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.604271650314331, "epoch": 0.4629566404916354, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.851755220263055e-07, "loss": 0.0, "num_tokens": 633873113.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1264.0, "completions/mean_length": 2013.490234375, "completions/mean_terminated_length": 575.5833740234375, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.549489974975586, "epoch": 0.46329805394332535, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.846638835675554e-07, "loss": 0.0, "num_tokens": 634976772.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.974609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 2015.6484375, "completions/mean_terminated_length": 773.84619140625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 7.550027251243591, "epoch": 0.4636394673950154, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.841520538330096e-07, "loss": 0.0, "num_tokens": 636093344.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.947265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2016.453125, "completions/mean_terminated_length": 1449.77783203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.407565116882324, "epoch": 0.46398088084670536, "frac_reward_zero_std": 0.96875, "grad_norm": 0.023076499907706157, "learning_rate": 6.836400335496682e-07, "loss": 0.0011, "num_tokens": 637211224.0, "reward": 0.0126953125, "reward_std": 0.013342384248971939, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.06243881583213806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0087890625, "rewards/tag_count_reward/std": 0.051121458411216736, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.97265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1694.0, "completions/mean_length": 2008.181640625, "completions/mean_terminated_length": 591.7857666015625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.565468668937683, "epoch": 0.46432229429839533, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.831278234448019e-07, "loss": 0.0, "num_tokens": 638340517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.935546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1977.2734375, "completions/mean_terminated_length": 950.6666870117188, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.416950821876526, "epoch": 0.46466370775008536, "frac_reward_zero_std": 0.96875, "grad_norm": 0.12282503114527375, "learning_rate": 6.826154242459507e-07, "loss": 0.0111, "num_tokens": 639433809.0, "reward": 0.01220703125, "reward_std": 0.004002714995294809, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01220703125, "rewards/tag_count_reward/std": 0.07148420810699463, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.978515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 2010.294921875, "completions/mean_terminated_length": 293.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.586134552955627, "epoch": 0.46500512120177534, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.821028366809238e-07, "loss": 0.0, "num_tokens": 640547000.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.939453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 1962.841796875, "completions/mean_terminated_length": 641.51611328125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.488121032714844, "epoch": 0.46534653465346537, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0189791431001047, "learning_rate": 6.815900614777972e-07, "loss": -0.0016, "num_tokens": 641633319.0, "reward": 0.01513671875, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.978515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1884.0, "completions/mean_length": 2020.966796875, "completions/mean_terminated_length": 789.727294921875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 7.629757285118103, "epoch": 0.46568794810515535, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.81077099364914e-07, "loss": 0.0, "num_tokens": 642748342.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.970703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 2004.8046875, "completions/mean_terminated_length": 573.6000366210938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.589628458023071, "epoch": 0.4660293615568453, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.805639510708826e-07, "loss": 0.0, "num_tokens": 643857698.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1227.0, "completions/mean_length": 2030.076171875, "completions/mean_terminated_length": 737.0000610351562, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 7.668140292167664, "epoch": 0.46637077500853535, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.80050617324576e-07, "loss": 0.0, "num_tokens": 644971737.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 2026.5859375, "completions/mean_terminated_length": 829.7777709960938, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 7.6032716035842896, "epoch": 0.46671218846022533, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.795370988551301e-07, "loss": 0.0, "num_tokens": 646079765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.97265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1892.0, "completions/mean_length": 2004.171875, "completions/mean_terminated_length": 445.14288330078125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 7.585037708282471, "epoch": 0.4670536019119153, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.790233963919437e-07, "loss": 0.0, "num_tokens": 647178381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1268.0, "completions/mean_length": 2020.9453125, "completions/mean_terminated_length": 893.6666870117188, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "entropy": 7.6382728815078735, "epoch": 0.46739501536360534, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.78509510664677e-07, "loss": 0.0, "num_tokens": 648290497.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1771.0, "completions/mean_length": 2025.12890625, "completions/mean_terminated_length": 746.888916015625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 7.622429966926575, "epoch": 0.4677364288152953, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.779954424032501e-07, "loss": 0.0, "num_tokens": 649411987.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.97265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 2008.62890625, "completions/mean_terminated_length": 608.1428833007812, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 7.6350626945495605, "epoch": 0.46807784226698534, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.774811923378424e-07, "loss": 0.0, "num_tokens": 650510597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1139.0, "completions/mean_length": 2013.19921875, "completions/mean_terminated_length": 563.1666870117188, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 7.628583312034607, "epoch": 0.4684192557186753, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.769667611988922e-07, "loss": 0.0, "num_tokens": 651609355.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 2002.888671875, "completions/mean_terminated_length": 604.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.6562817096710205, "epoch": 0.4687606691703653, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.764521497170938e-07, "loss": 0.0, "num_tokens": 652706402.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.978515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1349.0, "completions/mean_length": 2012.451171875, "completions/mean_terminated_length": 393.3636474609375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.639953136444092, "epoch": 0.4691020826220553, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.759373586233988e-07, "loss": 0.0, "num_tokens": 653812329.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1149.0, "completions/mean_length": 2009.318359375, "completions/mean_terminated_length": 397.5833435058594, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 7.596144318580627, "epoch": 0.4694434960737453, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.754223886490136e-07, "loss": 0.0, "num_tokens": 654922332.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.974609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1323.0, "completions/mean_length": 2012.1953125, "completions/mean_terminated_length": 637.84619140625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 7.638441443443298, "epoch": 0.4697849095254353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.749072405253981e-07, "loss": 0.0, "num_tokens": 656030032.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 1993.623046875, "completions/mean_terminated_length": 307.9375, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.595116019248962, "epoch": 0.4701263229771253, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.743919149842661e-07, "loss": 0.0, "num_tokens": 657124511.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.978515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 2017.697265625, "completions/mean_terminated_length": 637.5454711914062, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 7.594911575317383, "epoch": 0.4704677364288153, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.738764127575828e-07, "loss": 0.0, "num_tokens": 658238372.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1185.0, "completions/mean_length": 2013.48046875, "completions/mean_terminated_length": 575.1666870117188, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.620149612426758, "epoch": 0.4708091498805053, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.733607345775646e-07, "loss": 0.0, "num_tokens": 659343802.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1889.0, "completions/mean_length": 2035.2109375, "completions/mean_terminated_length": 956.6666870117188, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 7.536513566970825, "epoch": 0.4711505633321953, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.728448811766776e-07, "loss": 0.0, "num_tokens": 660476102.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.951171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2010.546875, "completions/mean_terminated_length": 1280.9599609375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.398463606834412, "epoch": 0.47149197678388527, "frac_reward_zero_std": 0.96875, "grad_norm": 0.02083006896883206, "learning_rate": 6.723288532876372e-07, "loss": 0.0015, "num_tokens": 661590078.0, "reward": 0.0146484375, "reward_std": 0.002668476663529873, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0146484375, "rewards/tag_count_reward/std": 0.08293935656547546, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1316.0, "completions/mean_length": 2022.875, "completions/mean_terminated_length": 618.6666870117188, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 7.638543486595154, "epoch": 0.4718333902355753, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.718126516434065e-07, "loss": 0.0, "num_tokens": 662700590.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1851.0, "completions/mean_length": 2001.53515625, "completions/mean_terminated_length": 726.3333129882812, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.638910889625549, "epoch": 0.4721748036872653, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.71296276977195e-07, "loss": 0.0, "num_tokens": 663795760.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.974609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 2016.84765625, "completions/mean_terminated_length": 821.0769653320312, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 7.638675093650818, "epoch": 0.47251621713895525, "frac_reward_zero_std": 0.96875, "grad_norm": 0.2672793693880426, "learning_rate": 6.707797300224585e-07, "loss": -0.0002, "num_tokens": 664903218.0, "reward": 0.00048828125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00048828125, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1849.0, "completions/mean_length": 2026.056640625, "completions/mean_terminated_length": 443.0000305175781, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 7.5639379024505615, "epoch": 0.4728576305906453, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.702630115128971e-07, "loss": 0.0, "num_tokens": 666025663.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1560.0, "completions/mean_length": 2025.181640625, "completions/mean_terminated_length": 587.625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.707345008850098, "epoch": 0.47319904404233526, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.69746122182455e-07, "loss": 0.0, "num_tokens": 667137292.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 2028.078125, "completions/mean_terminated_length": 590.857177734375, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.671184420585632, "epoch": 0.4735404574940253, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.692290627653186e-07, "loss": 0.0, "num_tokens": 668253924.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1324.0, "completions/mean_length": 2020.111328125, "completions/mean_terminated_length": 620.1000366210938, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 7.658313512802124, "epoch": 0.47388187094571527, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.68711833995916e-07, "loss": 0.0, "num_tokens": 669361965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1877.0, "completions/mean_length": 2018.85546875, "completions/mean_terminated_length": 390.0, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 7.700890302658081, "epoch": 0.47422328439740524, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.681944366089162e-07, "loss": 0.0, "num_tokens": 670478067.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 2021.375, "completions/mean_terminated_length": 344.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 7.726675629615784, "epoch": 0.4745646978490953, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.676768713392272e-07, "loss": 0.0, "num_tokens": 671584755.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.97265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1137.0, "completions/mean_length": 2002.09375, "completions/mean_terminated_length": 369.14288330078125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.668823599815369, "epoch": 0.47490611130078525, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.67159138921996e-07, "loss": 0.0, "num_tokens": 672693219.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1329.0, "completions/mean_length": 2023.66796875, "completions/mean_terminated_length": 663.7777709960938, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.684971928596497, "epoch": 0.4752475247524752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.666412400926063e-07, "loss": 0.0, "num_tokens": 673808633.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 2026.58984375, "completions/mean_terminated_length": 482.0000305175781, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.685759663581848, "epoch": 0.47558893820416526, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.66123175586679e-07, "loss": 0.0, "num_tokens": 674927927.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1190.0, "completions/mean_length": 2032.529296875, "completions/mean_terminated_length": 727.8333740234375, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "entropy": 7.756325721740723, "epoch": 0.47593035165585523, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.656049461400695e-07, "loss": 0.0, "num_tokens": 676042166.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1418.0, "completions/mean_length": 2033.2890625, "completions/mean_terminated_length": 792.6666870117188, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 7.699577331542969, "epoch": 0.47627176510754526, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.65086552488868e-07, "loss": 0.0, "num_tokens": 677162458.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2024.548828125, "completions/mean_terminated_length": 1547.7083740234375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.508523941040039, "epoch": 0.47661317855923524, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0906882277587447, "learning_rate": 6.645679953693981e-07, "loss": 0.0037, "num_tokens": 678275619.0, "reward": 0.0087890625, "reward_std": 0.002668476663529873, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0087890625, "rewards/tag_count_reward/std": 0.051121458411216736, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 2008.97265625, "completions/mean_terminated_length": 382.8333435058594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.732432126998901, "epoch": 0.4769545920109252, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.640492755182152e-07, "loss": 0.0, "num_tokens": 679378453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1306.0, "completions/mean_length": 2023.53515625, "completions/mean_terminated_length": 482.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 7.781233072280884, "epoch": 0.47729600546261525, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.635303936721056e-07, "loss": 0.0, "num_tokens": 680486023.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2005.646484375, "completions/mean_terminated_length": 1015.3809814453125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 7.627496361732483, "epoch": 0.4776374189143052, "frac_reward_zero_std": 0.96875, "grad_norm": 0.11961555071970938, "learning_rate": 6.630113505680864e-07, "loss": 0.0033, "num_tokens": 681599906.0, "reward": 0.03369140625, "reward_std": 0.01775088720023632, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.1385180652141571, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01416015625, "rewards/tag_count_reward/std": 0.08078429847955704, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 2022.873046875, "completions/mean_terminated_length": 210.1428680419922, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.712944269180298, "epoch": 0.4779788323659952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.624921469434035e-07, "loss": 0.0, "num_tokens": 682712609.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 2031.263671875, "completions/mean_terminated_length": 976.875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 7.751688241958618, "epoch": 0.47832024581768523, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.619727835355303e-07, "loss": 0.0, "num_tokens": 683828440.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.955078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1990.115234375, "completions/mean_terminated_length": 759.434814453125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.676978826522827, "epoch": 0.4786616592693752, "frac_reward_zero_std": 0.96875, "grad_norm": 0.023924509124916574, "learning_rate": 6.614532610821678e-07, "loss": -0.0012, "num_tokens": 684930515.0, "reward": 0.02099609375, "reward_std": 0.01298497710376978, "rewards/accuracy_reward/mean": 0.005859375, "rewards/accuracy_reward/std": 0.07639661431312561, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1112.0, "completions/mean_length": 2036.31640625, "completions/mean_terminated_length": 552.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 7.7316190004348755, "epoch": 0.47900307272106524, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.609335803212428e-07, "loss": 0.0, "num_tokens": 686053029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 2033.146484375, "completions/mean_terminated_length": 146.75, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 7.774896264076233, "epoch": 0.4793444861727552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.604137419909064e-07, "loss": 0.0, "num_tokens": 687163296.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 2031.251953125, "completions/mean_terminated_length": 333.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.7630531787872314, "epoch": 0.4796858996244452, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.598937468295344e-07, "loss": 0.0, "num_tokens": 688282817.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 2034.375, "completions/mean_terminated_length": 652.7999877929688, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.774959444999695, "epoch": 0.4800273130761352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.593735955757246e-07, "loss": 0.0, "num_tokens": 689403281.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 2028.861328125, "completions/mean_terminated_length": 414.8333435058594, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 7.822268009185791, "epoch": 0.4803687265278252, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.58853288968297e-07, "loss": 0.0, "num_tokens": 690517066.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 2024.041015625, "completions/mean_terminated_length": 295.5714416503906, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.803145408630371, "epoch": 0.48071013997951517, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.583328277462919e-07, "loss": 0.0, "num_tokens": 691629215.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 2032.173828125, "completions/mean_terminated_length": 427.3999938964844, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.812495708465576, "epoch": 0.4810515534312052, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.578122126489696e-07, "loss": 0.0, "num_tokens": 692748840.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.955078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2025.400390625, "completions/mean_terminated_length": 1544.9130859375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 7.718473672866821, "epoch": 0.4813929668828952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.572914444158084e-07, "loss": 0.0, "num_tokens": 693868821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 2033.36328125, "completions/mean_terminated_length": 549.2000122070312, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 7.772576808929443, "epoch": 0.48173438033458515, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.567705237865049e-07, "loss": 0.0, "num_tokens": 694993023.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 2022.01171875, "completions/mean_terminated_length": 384.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.822969675064087, "epoch": 0.4820757937862752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.562494515009715e-07, "loss": 0.0, "num_tokens": 696108037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1121.0, "completions/mean_length": 2026.61328125, "completions/mean_terminated_length": 483.71429443359375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 7.747234344482422, "epoch": 0.48241720723796516, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.557282282993363e-07, "loss": 0.0, "num_tokens": 697218607.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 2023.04296875, "completions/mean_terminated_length": 222.57144165039062, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 7.826494455337524, "epoch": 0.4827586206896552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.552068549219415e-07, "loss": 0.0, "num_tokens": 698331301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1164.0, "completions/mean_length": 2042.421875, "completions/mean_terminated_length": 620.0, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 7.813231468200684, "epoch": 0.48310003414134517, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.546853321093429e-07, "loss": 0.0, "num_tokens": 699451661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1194.0, "completions/mean_length": 2031.279296875, "completions/mean_terminated_length": 335.8000183105469, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.7940322160720825, "epoch": 0.48344144759303515, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.541636606023086e-07, "loss": 0.0, "num_tokens": 700570828.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2017.41015625, "completions/mean_terminated_length": 1336.0909423828125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.569270968437195, "epoch": 0.4837828610447252, "frac_reward_zero_std": 0.96875, "grad_norm": 0.23858168300973276, "learning_rate": 6.536418411418176e-07, "loss": 0.0032, "num_tokens": 701678446.0, "reward": 0.0322265625, "reward_std": 0.01953125, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.1385180652141571, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0126953125, "rewards/tag_count_reward/std": 0.07392385601997375, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 2025.55078125, "completions/mean_terminated_length": 611.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.773271799087524, "epoch": 0.48412427449641515, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.531198744690596e-07, "loss": 0.0, "num_tokens": 702796792.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 2031.169921875, "completions/mean_terminated_length": 324.6000061035156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.745810866355896, "epoch": 0.48446568794810513, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.525977613254326e-07, "loss": 0.0, "num_tokens": 703910447.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1331.0, "completions/mean_length": 2028.4609375, "completions/mean_terminated_length": 618.857177734375, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.798166036605835, "epoch": 0.48480710139979516, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.520755024525431e-07, "loss": 0.0, "num_tokens": 705025291.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1208.0, "completions/mean_length": 2035.32421875, "completions/mean_terminated_length": 750.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 7.8070374727249146, "epoch": 0.48514851485148514, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.515530985922047e-07, "loss": 0.0, "num_tokens": 706133681.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 2032.56640625, "completions/mean_terminated_length": 72.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 7.812551498413086, "epoch": 0.48548992830317517, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.510305504864369e-07, "loss": 0.0, "num_tokens": 707251139.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 2013.099609375, "completions/mean_terminated_length": 261.1000061035156, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.78221583366394, "epoch": 0.48583134175486514, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.505078588774637e-07, "loss": 0.0, "num_tokens": 708353814.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 2034.14453125, "completions/mean_terminated_length": 629.2000122070312, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.766154646873474, "epoch": 0.4861727552065551, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.499850245077135e-07, "loss": 0.0, "num_tokens": 709470672.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1942.0, "completions/mean_length": 2040.47265625, "completions/mean_terminated_length": 1084.5, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 7.777464032173157, "epoch": 0.48651416865824515, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.494620481198169e-07, "loss": 0.0, "num_tokens": 710588866.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1754.0, "completions/mean_length": 2027.41796875, "completions/mean_terminated_length": 730.75, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 7.812120079994202, "epoch": 0.4868555821099351, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.489389304566067e-07, "loss": 0.0, "num_tokens": 711701528.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1027.0, "completions/mean_length": 2033.931640625, "completions/mean_terminated_length": 607.4000244140625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 7.807507753372192, "epoch": 0.4871969955616251, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.484156722611161e-07, "loss": 0.0, "num_tokens": 712822709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 2027.275390625, "completions/mean_terminated_length": 279.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.806323766708374, "epoch": 0.48753840901331513, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.478922742765782e-07, "loss": 0.0, "num_tokens": 713937762.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1149.0, "completions/mean_length": 2026.896484375, "completions/mean_terminated_length": 504.4285888671875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.8168147802352905, "epoch": 0.4878798224650051, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.473687372464243e-07, "loss": 0.0, "num_tokens": 715057053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 2038.37890625, "completions/mean_terminated_length": 406.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 7.790443181991577, "epoch": 0.48822123591669514, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.468450619142831e-07, "loss": 0.0, "num_tokens": 716180847.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 2022.306640625, "completions/mean_terminated_length": 403.625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.8342941999435425, "epoch": 0.4885626493683851, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.463212490239804e-07, "loss": 0.0, "num_tokens": 717291148.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 2028.779296875, "completions/mean_terminated_length": 79.80000305175781, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.760466694831848, "epoch": 0.4889040628200751, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.457972993195369e-07, "loss": 0.0, "num_tokens": 718414859.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 2022.044921875, "completions/mean_terminated_length": 719.1000366210938, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 7.809807300567627, "epoch": 0.4892454762717651, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.452732135451674e-07, "loss": 0.0, "num_tokens": 719524466.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1894.0, "completions/mean_length": 2025.666015625, "completions/mean_terminated_length": 777.4444580078125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.768809080123901, "epoch": 0.4895868897234551, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.447489924452806e-07, "loss": 0.0, "num_tokens": 720638071.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 2045.818359375, "completions/mean_terminated_length": 931.0, "completions/min_length": 931.0, "completions/min_terminated_length": 931.0, "entropy": 7.782857775688171, "epoch": 0.4899283031751451, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.442246367644769e-07, "loss": 0.0, "num_tokens": 721761434.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2018.74609375, "completions/mean_terminated_length": 1423.916748046875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.672024130821228, "epoch": 0.4902697166268351, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.437001472475482e-07, "loss": 0.0, "num_tokens": 722891704.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1773.0, "completions/mean_length": 2036.90625, "completions/mean_terminated_length": 912.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.8177491426467896, "epoch": 0.4906111300785251, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.431755246394763e-07, "loss": 0.0, "num_tokens": 724014104.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 2034.978515625, "completions/mean_terminated_length": 381.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.812253952026367, "epoch": 0.4909525435302151, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.426507696854321e-07, "loss": 0.0, "num_tokens": 725132813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 2031.8515625, "completions/mean_terminated_length": 670.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 7.78102707862854, "epoch": 0.4912939569819051, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.421258831307744e-07, "loss": 0.0, "num_tokens": 726256209.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 2040.625, "completions/mean_terminated_length": 160.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 7.746987342834473, "epoch": 0.49163537043359506, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.416008657210492e-07, "loss": 0.0, "num_tokens": 727383777.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.92578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1986.484375, "completions/mean_terminated_length": 1219.157958984375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 7.485952973365784, "epoch": 0.4919767838852851, "frac_reward_zero_std": 0.9375, "grad_norm": 0.17654076872747673, "learning_rate": 6.41075718201988e-07, "loss": 0.0044, "num_tokens": 728482617.0, "reward": 0.05029296875, "reward_std": 0.015746597200632095, "rewards/accuracy_reward/mean": 0.025390625, "rewards/accuracy_reward/std": 0.15746226906776428, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.02490234375, "rewards/tag_count_reward/std": 0.10131227970123291, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1131.0, "completions/mean_length": 2026.33984375, "completions/mean_terminated_length": 463.71429443359375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 7.778422594070435, "epoch": 0.49231819733697507, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.405504413195072e-07, "loss": 0.0, "num_tokens": 729598695.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1098.0, "completions/mean_length": 2026.693359375, "completions/mean_terminated_length": 489.5714416503906, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.77836537361145, "epoch": 0.49265961078866505, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.400250358197072e-07, "loss": 0.0, "num_tokens": 730709162.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 2025.638671875, "completions/mean_terminated_length": 412.4285888671875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.78137481212616, "epoch": 0.4930010242403551, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.39499502448871e-07, "loss": 0.0, "num_tokens": 731818705.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1360.0, "completions/mean_length": 2042.603515625, "completions/mean_terminated_length": 1127.0, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "entropy": 7.765941143035889, "epoch": 0.49334243769204505, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.389738419534628e-07, "loss": 0.0, "num_tokens": 732949142.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 2027.724609375, "completions/mean_terminated_length": 565.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 7.771393656730652, "epoch": 0.4936838511437351, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.38448055080128e-07, "loss": 0.0, "num_tokens": 734061513.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 2031.333984375, "completions/mean_terminated_length": 829.0000610351562, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 7.823806285858154, "epoch": 0.49402526459542506, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.379221425756913e-07, "loss": 0.0, "num_tokens": 735176996.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 2019.03515625, "completions/mean_terminated_length": 194.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 7.794010043144226, "epoch": 0.49436667804711504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.373961051871552e-07, "loss": 0.0, "num_tokens": 736293974.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 2033.130859375, "completions/mean_terminated_length": 525.4000244140625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 7.795298933982849, "epoch": 0.49470809149880507, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.368699436617006e-07, "loss": 0.0, "num_tokens": 737406489.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 2036.1015625, "completions/mean_terminated_length": 829.6000366210938, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 7.768995761871338, "epoch": 0.49504950495049505, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.363436587466842e-07, "loss": 0.0, "num_tokens": 738527165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 2029.314453125, "completions/mean_terminated_length": 453.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 7.737027645111084, "epoch": 0.495390918402185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.358172511896381e-07, "loss": 0.0, "num_tokens": 739653086.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 2030.20703125, "completions/mean_terminated_length": 746.5714721679688, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.7826865911483765, "epoch": 0.49573233185387505, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.352907217382684e-07, "loss": 0.0, "num_tokens": 740772568.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 1991.30078125, "completions/mean_terminated_length": 596.5, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "entropy": 7.735742807388306, "epoch": 0.49607374530556503, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.347640711404545e-07, "loss": 0.0, "num_tokens": 741871506.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1164.0, "completions/mean_length": 2031.130859375, "completions/mean_terminated_length": 608.5, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 7.751599669456482, "epoch": 0.49641515875725506, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.342373001442476e-07, "loss": 0.0, "num_tokens": 742988389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1882.0, "completions/mean_length": 2027.544921875, "completions/mean_terminated_length": 738.875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 7.769779443740845, "epoch": 0.49675657220894504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.337104094978705e-07, "loss": 0.0, "num_tokens": 744101580.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1116.0, "completions/mean_length": 2018.55859375, "completions/mean_terminated_length": 373.1111145019531, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 7.765509009361267, "epoch": 0.497097985660635, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.331833999497157e-07, "loss": 0.0, "num_tokens": 745211178.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 2015.8125, "completions/mean_terminated_length": 400.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.767870306968689, "epoch": 0.49743939911232504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.326562722483442e-07, "loss": 0.0, "num_tokens": 746331562.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 2037.576171875, "completions/mean_terminated_length": 269.0, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 7.786652565002441, "epoch": 0.497780812564015, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.32129027142485e-07, "loss": 0.0, "num_tokens": 747464545.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1543.0, "completions/mean_length": 2026.599609375, "completions/mean_terminated_length": 482.71429443359375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.809838771820068, "epoch": 0.498122226015705, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.316016653810344e-07, "loss": 0.0, "num_tokens": 748586308.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 2022.248046875, "completions/mean_terminated_length": 583.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 7.7682178020477295, "epoch": 0.498463639467395, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.310741877130537e-07, "loss": 0.0, "num_tokens": 749694387.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1513.0, "completions/mean_length": 2017.37109375, "completions/mean_terminated_length": 479.8000183105469, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.798843264579773, "epoch": 0.498805052919085, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.305465948877691e-07, "loss": 0.0, "num_tokens": 750801473.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1076.0, "completions/mean_length": 2027.17578125, "completions/mean_terminated_length": 524.857177734375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.789110064506531, "epoch": 0.49914646637077503, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.300188876545705e-07, "loss": 0.0, "num_tokens": 751925995.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1960.0, "completions/mean_length": 2028.671875, "completions/mean_terminated_length": 811.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 7.770009279251099, "epoch": 0.499487879822465, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.294910667630099e-07, "loss": 0.0, "num_tokens": 753042515.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 2023.79296875, "completions/mean_terminated_length": 277.4285888671875, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.833716869354248, "epoch": 0.499829293274155, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.289631329628014e-07, "loss": 0.0, "num_tokens": 754157097.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1261.0, "completions/mean_length": 2027.365234375, "completions/mean_terminated_length": 287.16668701171875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.781138062477112, "epoch": 0.500170706725845, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.28435087003819e-07, "loss": 0.0, "num_tokens": 755276068.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 2033.974609375, "completions/mean_terminated_length": 252.75, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 7.793546438217163, "epoch": 0.500512120177535, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.279069296360957e-07, "loss": 0.0, "num_tokens": 756393767.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.955078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1999.595703125, "completions/mean_terminated_length": 970.478271484375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 7.6094359159469604, "epoch": 0.500853533629225, "frac_reward_zero_std": 0.96875, "grad_norm": 0.18155748014344422, "learning_rate": 6.273786616098238e-07, "loss": 0.0032, "num_tokens": 757496024.0, "reward": 0.01513671875, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1761.0, "completions/mean_length": 2025.65234375, "completions/mean_terminated_length": 617.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 7.797827959060669, "epoch": 0.5011949470809149, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.268502836753516e-07, "loss": 0.0, "num_tokens": 758605302.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1164.0, "completions/mean_length": 2024.705078125, "completions/mean_terminated_length": 557.125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.781368374824524, "epoch": 0.501536360532605, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.263217965831844e-07, "loss": 0.0, "num_tokens": 759710799.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 2041.818359375, "completions/mean_terminated_length": 465.5, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 7.796060562133789, "epoch": 0.501877773984295, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.25793201083982e-07, "loss": 0.0, "num_tokens": 760826354.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1258.0, "completions/mean_length": 2018.490234375, "completions/mean_terminated_length": 537.1000366210938, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.763947010040283, "epoch": 0.5022191874359849, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.252644979285583e-07, "loss": 0.0, "num_tokens": 761941565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 2036.98828125, "completions/mean_terminated_length": 638.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 7.795804381370544, "epoch": 0.502560600887675, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.247356878678802e-07, "loss": 0.0, "num_tokens": 763060503.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 2047.47265625, "completions/mean_terminated_length": 1778.0, "completions/min_length": 1778.0, "completions/min_terminated_length": 1778.0, "entropy": 7.760776877403259, "epoch": 0.502902014339365, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.242067716530666e-07, "loss": 0.0, "num_tokens": 764182153.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.955078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1252.0, "completions/mean_length": 1980.87890625, "completions/mean_terminated_length": 553.8261108398438, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.671037673950195, "epoch": 0.503243427791055, "frac_reward_zero_std": 0.96875, "grad_norm": 0.04169558239209276, "learning_rate": 6.23677750035387e-07, "loss": 0.0035, "num_tokens": 765280331.0, "reward": 0.0146484375, "reward_std": 0.002668476663529873, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0146484375, "rewards/tag_count_reward/std": 0.08293935656547546, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2038.91015625, "completions/mean_terminated_length": 1815.300048828125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.5568320751190186, "epoch": 0.5035848412427449, "frac_reward_zero_std": 0.96875, "grad_norm": 3.3783462486237466, "learning_rate": 6.231486237662604e-07, "loss": 0.0, "num_tokens": 766411917.0, "reward": 0.00830078125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00830078125, "rewards/tag_count_reward/std": 0.047485124319791794, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1142.0, "completions/mean_length": 2028.025390625, "completions/mean_terminated_length": 587.0, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 7.799834966659546, "epoch": 0.503926254694435, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.226193935972549e-07, "loss": 0.0, "num_tokens": 767530282.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 2039.103515625, "completions/mean_terminated_length": 529.6666870117188, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 7.780717968940735, "epoch": 0.504267668146125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.220900602800858e-07, "loss": 0.0, "num_tokens": 768651007.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1600.0, "completions/mean_length": 2019.392578125, "completions/mean_terminated_length": 420.5555725097656, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.748927354812622, "epoch": 0.5046090815978149, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.215606245666152e-07, "loss": 0.0, "num_tokens": 769761000.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 2030.169921875, "completions/mean_terminated_length": 743.857177734375, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.765902161598206, "epoch": 0.504950495049505, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.210310872088502e-07, "loss": 0.0, "num_tokens": 770878431.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1258.0, "completions/mean_length": 2021.45703125, "completions/mean_terminated_length": 538.0, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 7.695705771446228, "epoch": 0.505291908501195, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.20501448958943e-07, "loss": 0.0, "num_tokens": 771991673.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 2027.673828125, "completions/mean_terminated_length": 313.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.701486110687256, "epoch": 0.5056333219528849, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.199717105691884e-07, "loss": 0.0, "num_tokens": 773104418.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2033.775390625, "completions/mean_terminated_length": 1701.1905517578125, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.43568754196167, "epoch": 0.5059747354045749, "frac_reward_zero_std": 0.96875, "grad_norm": 0.020502628711450945, "learning_rate": 6.194418727920238e-07, "loss": -0.0, "num_tokens": 774218591.0, "reward": 0.00927734375, "reward_std": 0.003149319440126419, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00927734375, "rewards/tag_count_reward/std": 0.05451139807701111, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1479.0, "completions/mean_length": 2028.375, "completions/mean_terminated_length": 612.5714721679688, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.662572979927063, "epoch": 0.506316148856265, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.189119363800277e-07, "loss": 0.0, "num_tokens": 775339263.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1305.0, "completions/mean_length": 2030.5625, "completions/mean_terminated_length": 560.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.672950863838196, "epoch": 0.5066575623079549, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.183819020859187e-07, "loss": 0.0, "num_tokens": 776453535.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 2029.158203125, "completions/mean_terminated_length": 118.5999984741211, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 7.643057465553284, "epoch": 0.5069989757596449, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.178517706625544e-07, "loss": 0.0, "num_tokens": 777571888.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2028.259765625, "completions/mean_terminated_length": 1626.875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 7.568320155143738, "epoch": 0.507340389211335, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.173215428629303e-07, "loss": 0.0, "num_tokens": 778698565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.974609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1062.0, "completions/mean_length": 2007.18359375, "completions/mean_terminated_length": 440.4615478515625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 7.700070738792419, "epoch": 0.5076818026630249, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.167912194401791e-07, "loss": 0.0, "num_tokens": 779800915.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1084.0, "completions/mean_length": 2014.916015625, "completions/mean_terminated_length": 354.1000061035156, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.630199193954468, "epoch": 0.5080232161147149, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.162608011475687e-07, "loss": 0.0, "num_tokens": 780911208.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 2024.609375, "completions/mean_terminated_length": 551.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 7.6303606033325195, "epoch": 0.5083646295664049, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.157302887385028e-07, "loss": 0.0, "num_tokens": 782026224.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1060.0, "completions/mean_length": 2032.1796875, "completions/mean_terminated_length": 428.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 7.665086388587952, "epoch": 0.508706043018095, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.151996829665176e-07, "loss": 0.0, "num_tokens": 783142860.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 2014.046875, "completions/mean_terminated_length": 309.6000061035156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.644742369651794, "epoch": 0.5090474564697849, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.146689845852825e-07, "loss": 0.0, "num_tokens": 784246724.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1169.0, "completions/mean_length": 2021.51171875, "completions/mean_terminated_length": 352.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.656480073928833, "epoch": 0.5093888699214749, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.141381943485986e-07, "loss": 0.0, "num_tokens": 785353194.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1507.0, "completions/mean_length": 2032.208984375, "completions/mean_terminated_length": 700.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 7.621102213859558, "epoch": 0.509730283373165, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.136073130103972e-07, "loss": 0.0, "num_tokens": 786468245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1058.0, "completions/mean_length": 2023.462890625, "completions/mean_terminated_length": 477.625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.618522763252258, "epoch": 0.5100716968248549, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.130763413247388e-07, "loss": 0.0, "num_tokens": 787570930.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 2032.431640625, "completions/mean_terminated_length": 55.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.591973423957825, "epoch": 0.5104131102765449, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.125452800458128e-07, "loss": 0.0, "num_tokens": 788692879.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1319.0, "completions/mean_length": 2009.08203125, "completions/mean_terminated_length": 387.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.6345449686050415, "epoch": 0.5107545237282349, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.120141299279355e-07, "loss": 0.0, "num_tokens": 789789689.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1130.0, "completions/mean_length": 2038.515625, "completions/mean_terminated_length": 429.3333435058594, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.631192564964294, "epoch": 0.5110959371799249, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.114828917255493e-07, "loss": 0.0, "num_tokens": 790910113.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 2033.263671875, "completions/mean_terminated_length": 161.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.614470958709717, "epoch": 0.5114373506316149, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.109515661932221e-07, "loss": 0.0, "num_tokens": 792035848.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 2020.41015625, "completions/mean_terminated_length": 282.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 7.578226923942566, "epoch": 0.5117787640833049, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.104201540856454e-07, "loss": 0.0, "num_tokens": 793148138.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 2037.76171875, "completions/mean_terminated_length": 999.6000366210938, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 7.611136317253113, "epoch": 0.5121201775349948, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.098886561576336e-07, "loss": 0.0, "num_tokens": 794267536.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1337.0, "completions/mean_length": 2026.173828125, "completions/mean_terminated_length": 451.5714416503906, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 7.615459084510803, "epoch": 0.5124615909866849, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.093570731641236e-07, "loss": 0.0, "num_tokens": 795378777.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 2018.49609375, "completions/mean_terminated_length": 537.4000244140625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 7.566625714302063, "epoch": 0.5128030044383749, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.08825405860173e-07, "loss": 0.0, "num_tokens": 796486551.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1208.0, "completions/mean_length": 2027.03515625, "completions/mean_terminated_length": 514.5714721679688, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 7.6081767082214355, "epoch": 0.5131444178900648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.082936550009584e-07, "loss": 0.0, "num_tokens": 797598601.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1141.0, "completions/mean_length": 2038.783203125, "completions/mean_terminated_length": 475.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.6084301471710205, "epoch": 0.5134858313417549, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.077618213417761e-07, "loss": 0.0, "num_tokens": 798717978.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1853.0, "completions/mean_length": 2021.84375, "completions/mean_terminated_length": 560.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.592054486274719, "epoch": 0.5138272447934449, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.072299056380392e-07, "loss": 0.0, "num_tokens": 799833578.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.978515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1517.0, "completions/mean_length": 2014.875, "completions/mean_terminated_length": 506.18182373046875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.616484761238098, "epoch": 0.5141686582451349, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.066979086452776e-07, "loss": 0.0, "num_tokens": 800940378.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1648.0, "completions/mean_length": 2033.0234375, "completions/mean_terminated_length": 514.4000244140625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.591125726699829, "epoch": 0.5145100716968248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.061658311191371e-07, "loss": 0.0, "num_tokens": 802050006.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 2038.896484375, "completions/mean_terminated_length": 494.3333435058594, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.602350115776062, "epoch": 0.5148514851485149, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.056336738153775e-07, "loss": 0.0, "num_tokens": 803173009.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 2035.52734375, "completions/mean_terminated_length": 451.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 7.586207151412964, "epoch": 0.5151928986002049, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.051014374898714e-07, "loss": 0.0, "num_tokens": 804288223.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1076.0, "completions/mean_length": 2024.873046875, "completions/mean_terminated_length": 356.4285888671875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.625764846801758, "epoch": 0.5155343120518948, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.045691228986048e-07, "loss": 0.0, "num_tokens": 805396350.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1983.18359375, "completions/mean_terminated_length": 862.7857666015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.497337818145752, "epoch": 0.5158757255035848, "frac_reward_zero_std": 0.96875, "grad_norm": 0.13945787919876712, "learning_rate": 6.040367307976739e-07, "loss": 0.0046, "num_tokens": 806485644.0, "reward": 0.01513671875, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 2018.541015625, "completions/mean_terminated_length": 539.7000122070312, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.635939836502075, "epoch": 0.5162171389552749, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.035042619432853e-07, "loss": 0.0, "num_tokens": 807593441.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1736.0, "completions/mean_length": 2030.212890625, "completions/mean_terminated_length": 530.1666870117188, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 7.608349800109863, "epoch": 0.5165585524069648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.029717170917549e-07, "loss": 0.0, "num_tokens": 808713838.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 2031.880859375, "completions/mean_terminated_length": 397.3999938964844, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 7.588972568511963, "epoch": 0.5168999658586548, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.024390969995064e-07, "loss": 0.0, "num_tokens": 809825825.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1065.0, "completions/mean_length": 2040.048828125, "completions/mean_terminated_length": 691.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 7.645043969154358, "epoch": 0.5172413793103449, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.019064024230697e-07, "loss": 0.0, "num_tokens": 810944922.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 2036.90234375, "completions/mean_terminated_length": 154.0, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 7.551825404167175, "epoch": 0.5175827927620348, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.013736341190814e-07, "loss": 0.0, "num_tokens": 812068120.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1412.0, "completions/mean_length": 2028.609375, "completions/mean_terminated_length": 629.7142944335938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.630214095115662, "epoch": 0.5179242062137248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.008407928442829e-07, "loss": 0.0, "num_tokens": 813183424.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1106.0, "completions/mean_length": 2024.982421875, "completions/mean_terminated_length": 364.4285888671875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 7.574969053268433, "epoch": 0.5182656196654148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.003078793555181e-07, "loss": 0.0, "num_tokens": 814298135.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.978515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 2021.537109375, "completions/mean_terminated_length": 816.2727661132812, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.548977255821228, "epoch": 0.5186070331171048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.99774894409735e-07, "loss": 0.0, "num_tokens": 815415770.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1419.0, "completions/mean_length": 2028.529296875, "completions/mean_terminated_length": 623.857177734375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.6209529638290405, "epoch": 0.5189484465687948, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.992418387639816e-07, "loss": 0.0, "num_tokens": 816532201.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1055.0, "completions/mean_length": 2025.89453125, "completions/mean_terminated_length": 431.14288330078125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.593577980995178, "epoch": 0.5192898600204848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.987087131754073e-07, "loss": 0.0, "num_tokens": 817646435.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 2012.615234375, "completions/mean_terminated_length": 236.3000030517578, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.606036067008972, "epoch": 0.5196312734721747, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.981755184012607e-07, "loss": 0.0, "num_tokens": 818748510.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1263.0, "completions/mean_length": 2035.466796875, "completions/mean_terminated_length": 764.6000366210938, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 7.581629753112793, "epoch": 0.5199726869238648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.976422551988885e-07, "loss": 0.0, "num_tokens": 819869453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1414.0, "completions/mean_length": 2028.24609375, "completions/mean_terminated_length": 603.1428833007812, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 7.594037652015686, "epoch": 0.5203141003755548, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.971089243257346e-07, "loss": 0.0, "num_tokens": 820977915.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 2027.20703125, "completions/mean_terminated_length": 273.66668701171875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.582813024520874, "epoch": 0.5206555138272448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.965755265393389e-07, "loss": 0.0, "num_tokens": 822094581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2035.537109375, "completions/mean_terminated_length": 1712.157958984375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.316666960716248, "epoch": 0.5209969272789348, "frac_reward_zero_std": 0.96875, "grad_norm": 0.08805134580584101, "learning_rate": 5.960420625973368e-07, "loss": 0.0014, "num_tokens": 823216920.0, "reward": 0.0107421875, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0107421875, "rewards/tag_count_reward/std": 0.06358373910188675, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 2007.787109375, "completions/mean_terminated_length": 332.25, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.633825063705444, "epoch": 0.5213383407306248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.955085332574572e-07, "loss": 0.0, "num_tokens": 824319531.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1181.0, "completions/mean_length": 2024.404296875, "completions/mean_terminated_length": 537.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.609009742736816, "epoch": 0.5216797541823148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.949749392775221e-07, "loss": 0.0, "num_tokens": 825428618.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 2030.63671875, "completions/mean_terminated_length": 270.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.581810474395752, "epoch": 0.5220211676340047, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.944412814154454e-07, "loss": 0.0, "num_tokens": 826550672.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 2029.994140625, "completions/mean_terminated_length": 511.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 7.580808520317078, "epoch": 0.5223625810856948, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.939075604292317e-07, "loss": 0.0, "num_tokens": 827670829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1331.0, "completions/mean_length": 2038.357421875, "completions/mean_terminated_length": 813.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 7.590463399887085, "epoch": 0.5227039945373848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.933737770769746e-07, "loss": 0.0, "num_tokens": 828787748.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 2024.521484375, "completions/mean_terminated_length": 330.71429443359375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 7.563924312591553, "epoch": 0.5230454079890747, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.928399321168575e-07, "loss": 0.0, "num_tokens": 829901855.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1079.0, "completions/mean_length": 2031.03515625, "completions/mean_terminated_length": 310.8000183105469, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.608558297157288, "epoch": 0.5233868214407648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.923060263071503e-07, "loss": 0.0, "num_tokens": 831016305.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2034.1328125, "completions/mean_terminated_length": 1653.5555419921875, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.392565965652466, "epoch": 0.5237282348924548, "frac_reward_zero_std": 0.96875, "grad_norm": 0.05954740256317203, "learning_rate": 5.917720604062098e-07, "loss": 0.0036, "num_tokens": 832141685.0, "reward": 0.02978515625, "reward_std": 0.02001357637345791, "rewards/accuracy_reward/mean": 0.017578125, "rewards/accuracy_reward/std": 0.13154059648513794, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01220703125, "rewards/tag_count_reward/std": 0.07148420810699463, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1445.0, "completions/mean_length": 1996.2890625, "completions/mean_terminated_length": 787.2380981445312, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.477241158485413, "epoch": 0.5240696483441447, "frac_reward_zero_std": 0.96875, "grad_norm": 0.023765061235400783, "learning_rate": 5.912380351724782e-07, "loss": 0.0017, "num_tokens": 833253017.0, "reward": 0.044921875, "reward_std": 0.0078125, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1317.0, "completions/mean_length": 2026.1953125, "completions/mean_terminated_length": 652.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.556303977966309, "epoch": 0.5244110617958347, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.907039513644817e-07, "loss": 0.0, "num_tokens": 834369005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1182.0, "completions/mean_length": 2036.849609375, "completions/mean_terminated_length": 620.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.6027960777282715, "epoch": 0.5247524752475248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.901698097408299e-07, "loss": 0.0, "num_tokens": 835491312.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1732.0, "completions/mean_length": 2032.44140625, "completions/mean_terminated_length": 1052.25, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "entropy": 7.567371368408203, "epoch": 0.5250938886992147, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.896356110602143e-07, "loss": 0.0, "num_tokens": 836607426.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 2024.88671875, "completions/mean_terminated_length": 357.4285888671875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.5815184116363525, "epoch": 0.5254353021509047, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.891013560814078e-07, "loss": 0.0, "num_tokens": 837720280.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1698.0, "completions/mean_length": 2027.1015625, "completions/mean_terminated_length": 519.4285888671875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 7.563825964927673, "epoch": 0.5257767156025948, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.885670455632628e-07, "loss": 0.0, "num_tokens": 838835804.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 2031.7578125, "completions/mean_terminated_length": 384.8000183105469, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.590158462524414, "epoch": 0.5261181290542848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.88032680264711e-07, "loss": 0.0, "num_tokens": 839949024.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 2035.10546875, "completions/mean_terminated_length": 397.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 7.5927369594573975, "epoch": 0.5264595425059747, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.874982609447618e-07, "loss": 0.0, "num_tokens": 841065110.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1980.607421875, "completions/mean_terminated_length": 479.5909118652344, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.457626461982727, "epoch": 0.5268009559576647, "frac_reward_zero_std": 0.96875, "grad_norm": 0.03781458074288753, "learning_rate": 5.869637883625013e-07, "loss": 0.0005, "num_tokens": 842166669.0, "reward": 0.0107421875, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0107421875, "rewards/tag_count_reward/std": 0.06358373910188675, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1078.0, "completions/mean_length": 2030.701171875, "completions/mean_terminated_length": 571.8333740234375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 7.623636841773987, "epoch": 0.5271423694093548, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.864292632770911e-07, "loss": 0.0, "num_tokens": 843284788.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1896.0, "completions/mean_length": 2040.41796875, "completions/mean_terminated_length": 1077.5, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "entropy": 7.558241486549377, "epoch": 0.5274837828610447, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.858946864477675e-07, "loss": 0.0, "num_tokens": 844414234.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 2030.23828125, "completions/mean_terminated_length": 532.3333740234375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 7.5839807987213135, "epoch": 0.5278251963127347, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.853600586338406e-07, "loss": 0.0, "num_tokens": 845530004.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1183.0, "completions/mean_length": 2030.30078125, "completions/mean_terminated_length": 537.6666870117188, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 7.5991727113723755, "epoch": 0.5281666097644248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.848253805946924e-07, "loss": 0.0, "num_tokens": 846649950.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1035.0, "completions/mean_length": 2012.517578125, "completions/mean_terminated_length": 231.3000030517578, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.577945828437805, "epoch": 0.5285080232161147, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.842906530897763e-07, "loss": 0.0, "num_tokens": 847762903.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.970703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1750.0, "completions/mean_length": 2006.18359375, "completions/mean_terminated_length": 620.6666870117188, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.590210556983948, "epoch": 0.5288494366678047, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.837558768786166e-07, "loss": 0.0, "num_tokens": 848867589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 2034.44921875, "completions/mean_terminated_length": 660.4000244140625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 7.536303400993347, "epoch": 0.5291908501194947, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.832210527208059e-07, "loss": 0.0, "num_tokens": 849993515.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.947265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1281.0, "completions/mean_length": 1975.373046875, "completions/mean_terminated_length": 670.7777709960938, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.421291470527649, "epoch": 0.5295322635711847, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.826861813760056e-07, "loss": 0.0, "num_tokens": 851087562.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 2040.056640625, "completions/mean_terminated_length": 14.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.561365723609924, "epoch": 0.5298736770228747, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.821512636039437e-07, "loss": 0.0, "num_tokens": 852201847.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 2035.4375, "completions/mean_terminated_length": 440.0, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 7.605057597160339, "epoch": 0.5302150904745647, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.816163001644143e-07, "loss": 0.0, "num_tokens": 853327767.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1185.0, "completions/mean_length": 2023.373046875, "completions/mean_terminated_length": 471.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.602855682373047, "epoch": 0.5305565039262546, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.810812918172764e-07, "loss": 0.0, "num_tokens": 854444838.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1063.0, "completions/mean_length": 2017.326171875, "completions/mean_terminated_length": 303.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.551914691925049, "epoch": 0.5308979173779447, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.805462393224526e-07, "loss": 0.0, "num_tokens": 855554157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 2019.697265625, "completions/mean_terminated_length": 236.625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.604829668998718, "epoch": 0.5312393308296347, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.800111434399285e-07, "loss": 0.0, "num_tokens": 856661410.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1896.0, "completions/mean_length": 2026.576171875, "completions/mean_terminated_length": 676.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.583415269851685, "epoch": 0.5315807442813247, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.794760049297511e-07, "loss": 0.0, "num_tokens": 857779513.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 2034.041015625, "completions/mean_terminated_length": 618.6000366210938, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.5987056493759155, "epoch": 0.5319221577330147, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.78940824552028e-07, "loss": 0.0, "num_tokens": 858901422.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 2032.673828125, "completions/mean_terminated_length": 740.1666870117188, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 7.6355249881744385, "epoch": 0.5322635711847047, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.784056030669264e-07, "loss": 0.0, "num_tokens": 860012023.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1293.0, "completions/mean_length": 2040.892578125, "completions/mean_terminated_length": 835.0, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "entropy": 7.584054231643677, "epoch": 0.5326049846363947, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.778703412346717e-07, "loss": 0.0, "num_tokens": 861139760.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1263.0, "completions/mean_length": 2035.10546875, "completions/mean_terminated_length": 397.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.56255567073822, "epoch": 0.5329463980880846, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.773350398155467e-07, "loss": 0.0, "num_tokens": 862270646.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1767.0, "completions/mean_length": 2031.337890625, "completions/mean_terminated_length": 626.1666870117188, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.589338779449463, "epoch": 0.5332878115397747, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.767996995698904e-07, "loss": 0.0, "num_tokens": 863383475.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1102.0, "completions/mean_length": 2028.474609375, "completions/mean_terminated_length": 381.8333435058594, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.586252808570862, "epoch": 0.5336292249914647, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.762643212580971e-07, "loss": 0.0, "num_tokens": 864497494.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1115.0, "completions/mean_length": 2021.474609375, "completions/mean_terminated_length": 539.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.555133938789368, "epoch": 0.5339706384431546, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.757289056406148e-07, "loss": 0.0, "num_tokens": 865616281.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1182.0, "completions/mean_length": 2032.16796875, "completions/mean_terminated_length": 697.0, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 7.551570415496826, "epoch": 0.5343120518948447, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.751934534779448e-07, "loss": 0.0, "num_tokens": 866737407.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1131.0, "completions/mean_length": 2014.49609375, "completions/mean_terminated_length": 332.6000061035156, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.579933047294617, "epoch": 0.5346534653465347, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.746579655306403e-07, "loss": 0.0, "num_tokens": 867845629.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2033.720703125, "completions/mean_terminated_length": 1663.2105712890625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 7.385553240776062, "epoch": 0.5349948787982246, "frac_reward_zero_std": 0.96875, "grad_norm": 0.06497931926034813, "learning_rate": 5.741224425593052e-07, "loss": 0.0016, "num_tokens": 868968926.0, "reward": 0.01123046875, "reward_std": 0.004002714995294809, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01123046875, "rewards/tag_count_reward/std": 0.06632548570632935, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 2027.576171875, "completions/mean_terminated_length": 554.1428833007812, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 7.566982388496399, "epoch": 0.5353362922499146, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.735868853245934e-07, "loss": 0.0, "num_tokens": 870082037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 2030.30078125, "completions/mean_terminated_length": 915.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 7.621805787086487, "epoch": 0.5356777057016047, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.73051294587207e-07, "loss": 0.0, "num_tokens": 871190799.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1527.0, "completions/mean_length": 2024.677734375, "completions/mean_terminated_length": 555.375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.568781614303589, "epoch": 0.5360191191532946, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.725156711078961e-07, "loss": 0.0, "num_tokens": 872304186.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 2036.724609375, "completions/mean_terminated_length": 123.66667175292969, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 7.558135747909546, "epoch": 0.5363605326049846, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.71980015647457e-07, "loss": 0.0, "num_tokens": 873425917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1126.0, "completions/mean_length": 2015.140625, "completions/mean_terminated_length": 365.6000061035156, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.588443756103516, "epoch": 0.5367019460566747, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.714443289667318e-07, "loss": 0.0, "num_tokens": 874533861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 2043.572265625, "completions/mean_terminated_length": 1292.3333740234375, "completions/min_length": 1032.0, "completions/min_terminated_length": 1032.0, "entropy": 7.625870704650879, "epoch": 0.5370433595083647, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.709086118266069e-07, "loss": 0.0, "num_tokens": 875650106.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 2028.1953125, "completions/mean_terminated_length": 20.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.5636887550354, "epoch": 0.5373847729600546, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.703728649880113e-07, "loss": 0.0, "num_tokens": 876766142.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 2032.814453125, "completions/mean_terminated_length": 493.0, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 7.633231997489929, "epoch": 0.5377261864117446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.698370892119171e-07, "loss": 0.0, "num_tokens": 877881919.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2037.837890625, "completions/mean_terminated_length": 1774.157958984375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 7.3954280614852905, "epoch": 0.5380675998634347, "frac_reward_zero_std": 0.96875, "grad_norm": 0.10914707596074932, "learning_rate": 5.693012852593369e-07, "loss": 0.0018, "num_tokens": 879007484.0, "reward": 0.009765625, "reward_std": 0.0034938561730086803, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.009765625, "rewards/tag_count_reward/std": 0.057698383927345276, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 2030.8671875, "completions/mean_terminated_length": 293.6000061035156, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.58551824092865, "epoch": 0.5384090133151246, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.687654538913238e-07, "loss": 0.0, "num_tokens": 880117416.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1523.0, "completions/mean_length": 2035.13671875, "completions/mean_terminated_length": 401.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 7.568869590759277, "epoch": 0.5387504267668146, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.682295958689691e-07, "loss": 0.0, "num_tokens": 881236734.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1883.0, "completions/mean_length": 2018.380859375, "completions/mean_terminated_length": 531.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.607050180435181, "epoch": 0.5390918402185046, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.676937119534027e-07, "loss": 0.0, "num_tokens": 882350865.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 2037.763671875, "completions/mean_terminated_length": 301.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 7.579804539680481, "epoch": 0.5394332536701946, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.67157802905791e-07, "loss": 0.0, "num_tokens": 883482424.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.951171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1784.0, "completions/mean_length": 1978.533203125, "completions/mean_terminated_length": 625.3200073242188, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 7.505564212799072, "epoch": 0.5397746671218846, "frac_reward_zero_std": 0.96875, "grad_norm": 0.022977782976587938, "learning_rate": 5.666218694873359e-07, "loss": -0.0013, "num_tokens": 884580393.0, "reward": 0.02294921875, "reward_std": 0.013217909261584282, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08812850713729858, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.955078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1992.0546875, "completions/mean_terminated_length": 802.6087036132812, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.449156761169434, "epoch": 0.5401160805735746, "frac_reward_zero_std": 0.96875, "grad_norm": 0.026355885685206453, "learning_rate": 5.660859124592744e-07, "loss": -0.0045, "num_tokens": 885684149.0, "reward": 0.00927734375, "reward_std": 0.003149319440126419, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00927734375, "rewards/tag_count_reward/std": 0.05451139807701111, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 2027.927734375, "completions/mean_terminated_length": 335.16668701171875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 7.613091230392456, "epoch": 0.5404574940252646, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.655499325828763e-07, "loss": 0.0, "num_tokens": 886788704.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1123.0, "completions/mean_length": 2025.23828125, "completions/mean_terminated_length": 383.14288330078125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.676219463348389, "epoch": 0.5407989074769546, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.650139306194448e-07, "loss": 0.0, "num_tokens": 887900506.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1204.0, "completions/mean_length": 2036.287109375, "completions/mean_terminated_length": 548.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.603075265884399, "epoch": 0.5411403209286446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.644779073303136e-07, "loss": 0.0, "num_tokens": 889030237.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 2040.052734375, "completions/mean_terminated_length": 13.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.570495843887329, "epoch": 0.5414817343803345, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.639418634768474e-07, "loss": 0.0, "num_tokens": 890153928.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.955078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1998.458984375, "completions/mean_terminated_length": 945.1739501953125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 7.532713294029236, "epoch": 0.5418231478320246, "frac_reward_zero_std": 0.96875, "grad_norm": 0.1019085174127145, "learning_rate": 5.634057998204392e-07, "loss": 0.0055, "num_tokens": 891260899.0, "reward": 0.01953125, "reward_std": 0.013975424692034721, "rewards/accuracy_reward/mean": 0.005859375, "rewards/accuracy_reward/std": 0.07639661431312561, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.013671875, "rewards/tag_count_reward/std": 0.07856711745262146, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 2034.23828125, "completions/mean_terminated_length": 286.5, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 7.5425803661346436, "epoch": 0.5421645612837146, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.628697171225113e-07, "loss": 0.0, "num_tokens": 892382029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1066.0, "completions/mean_length": 2042.087890625, "completions/mean_terminated_length": 534.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.635277509689331, "epoch": 0.5425059747354046, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.623336161445123e-07, "loss": 0.0, "num_tokens": 893503034.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.970703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1366.0, "completions/mean_length": 2004.52734375, "completions/mean_terminated_length": 564.1333618164062, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.548904180526733, "epoch": 0.5428473881870945, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.617974976479163e-07, "loss": 0.0, "num_tokens": 894604632.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1931.0, "completions/mean_length": 2030.98828125, "completions/mean_terminated_length": 596.3333740234375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.5592265129089355, "epoch": 0.5431888016387846, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.612613623942238e-07, "loss": 0.0, "num_tokens": 895722050.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1076.0, "completions/mean_length": 2034.296875, "completions/mean_terminated_length": 294.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.5916231870651245, "epoch": 0.5435302150904746, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.607252111449578e-07, "loss": 0.0, "num_tokens": 896846826.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1682.0, "completions/mean_length": 2021.3359375, "completions/mean_terminated_length": 682.7999877929688, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.608749032020569, "epoch": 0.5438716285421645, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.601890446616641e-07, "loss": 0.0, "num_tokens": 897958966.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2033.646484375, "completions/mean_terminated_length": 1698.047607421875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 7.474581718444824, "epoch": 0.5442130419938546, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.596528637059109e-07, "loss": 0.0, "num_tokens": 899080689.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 2031.572265625, "completions/mean_terminated_length": 365.8000183105469, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.587721347808838, "epoch": 0.5445544554455446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.591166690392863e-07, "loss": 0.0, "num_tokens": 900201254.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 2035.654296875, "completions/mean_terminated_length": 467.75, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 7.5907957553863525, "epoch": 0.5448958688972345, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.585804614233981e-07, "loss": 0.0, "num_tokens": 901329461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1755.0, "completions/mean_length": 2036.6015625, "completions/mean_terminated_length": 880.7999877929688, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 7.610776901245117, "epoch": 0.5452372823489245, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.580442416198725e-07, "loss": 0.0, "num_tokens": 902449961.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 2027.103515625, "completions/mean_terminated_length": 519.5714721679688, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 7.592149972915649, "epoch": 0.5455786958006146, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.575080103903531e-07, "loss": 0.0, "num_tokens": 903569774.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1548.0, "completions/mean_length": 2028.8515625, "completions/mean_terminated_length": 647.4285888671875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 7.615237355232239, "epoch": 0.5459201092523045, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.569717684964992e-07, "loss": 0.0, "num_tokens": 904684466.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1344.0, "completions/mean_length": 2028.76171875, "completions/mean_terminated_length": 640.857177734375, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.587846875190735, "epoch": 0.5462615227039945, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.564355166999862e-07, "loss": 0.0, "num_tokens": 905794216.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 2028.626953125, "completions/mean_terminated_length": 394.8333435058594, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.56122612953186, "epoch": 0.5466029361556846, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.558992557625028e-07, "loss": 0.0, "num_tokens": 906910009.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 2033.4609375, "completions/mean_terminated_length": 807.3333740234375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.598585367202759, "epoch": 0.5469443496073745, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.553629864457507e-07, "loss": 0.0, "num_tokens": 908026229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1762.0, "completions/mean_length": 2038.88671875, "completions/mean_terminated_length": 881.5, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 7.569777131080627, "epoch": 0.5472857630590645, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.54826709511444e-07, "loss": 0.0, "num_tokens": 909148907.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1127.0, "completions/mean_length": 2046.201171875, "completions/mean_terminated_length": 1127.0, "completions/min_length": 1127.0, "completions/min_terminated_length": 1127.0, "entropy": 7.582836866378784, "epoch": 0.5476271765107545, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.542904257213072e-07, "loss": 0.0, "num_tokens": 910279074.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 2024.94921875, "completions/mean_terminated_length": 362.0000305175781, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.5673264265060425, "epoch": 0.5479685899624446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.537541358370747e-07, "loss": 0.0, "num_tokens": 911387688.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1450.0, "completions/mean_length": 2038.330078125, "completions/mean_terminated_length": 810.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.673214673995972, "epoch": 0.5483100034141345, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.532178406204895e-07, "loss": 0.0, "num_tokens": 912502001.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1406.0, "completions/mean_length": 2027.755859375, "completions/mean_terminated_length": 567.2857666015625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 7.57843554019928, "epoch": 0.5486514168658245, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.526815408333023e-07, "loss": 0.0, "num_tokens": 913610772.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 2031.28125, "completions/mean_terminated_length": 336.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 7.566582679748535, "epoch": 0.5489928303175146, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.521452372372701e-07, "loss": 0.0, "num_tokens": 914723588.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.955078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1999.720703125, "completions/mean_terminated_length": 973.2608642578125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 7.442881464958191, "epoch": 0.5493342437692045, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.516089305941553e-07, "loss": 0.0, "num_tokens": 915825349.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1236.0, "completions/mean_length": 2028.572265625, "completions/mean_terminated_length": 942.7777709960938, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 7.576092958450317, "epoch": 0.5496756572208945, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.510726216657251e-07, "loss": 0.0, "num_tokens": 916938714.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 2037.716796875, "completions/mean_terminated_length": 731.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 7.624106407165527, "epoch": 0.5500170706725845, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.505363112137493e-07, "loss": 0.0, "num_tokens": 918058425.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1340.0, "completions/mean_length": 2027.337890625, "completions/mean_terminated_length": 536.7142944335938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.567732095718384, "epoch": 0.5503584841242745, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.5e-07, "loss": 0.0, "num_tokens": 919169926.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.978515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 2015.265625, "completions/mean_terminated_length": 524.3636474609375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.568628549575806, "epoch": 0.5506998975759645, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.494636887862507e-07, "loss": 0.0, "num_tokens": 920276414.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1406.0, "completions/mean_length": 2025.111328125, "completions/mean_terminated_length": 745.888916015625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 7.577011942863464, "epoch": 0.5510413110276545, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.489273783342749e-07, "loss": 0.0, "num_tokens": 921389671.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 2031.8046875, "completions/mean_terminated_length": 666.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 7.597971081733704, "epoch": 0.5513827244793444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.483910694058445e-07, "loss": 0.0, "num_tokens": 922504371.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 2037.396484375, "completions/mean_terminated_length": 238.33334350585938, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 7.592065691947937, "epoch": 0.5517241379310345, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.4785476276273e-07, "loss": 0.0, "num_tokens": 923622046.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 2030.375, "completions/mean_terminated_length": 243.1999969482422, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 7.623115181922913, "epoch": 0.5520655513827245, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.473184591666978e-07, "loss": 0.0, "num_tokens": 924741102.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 2020.75390625, "completions/mean_terminated_length": 304.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.577696919441223, "epoch": 0.5524069648344144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.467821593795105e-07, "loss": 0.0, "num_tokens": 925856704.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 2030.71875, "completions/mean_terminated_length": 278.3999938964844, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.583246111869812, "epoch": 0.5527483782861045, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.462458641629253e-07, "loss": 0.0, "num_tokens": 926969104.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1755.0, "completions/mean_length": 2028.884765625, "completions/mean_terminated_length": 416.8333435058594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.58339786529541, "epoch": 0.5530897917377945, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.457095742786929e-07, "loss": 0.0, "num_tokens": 928081861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1041.0, "completions/mean_length": 2040.216796875, "completions/mean_terminated_length": 719.6666870117188, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 7.598436236381531, "epoch": 0.5534312051894845, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.45173290488556e-07, "loss": 0.0, "num_tokens": 929199972.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1146.0, "completions/mean_length": 2025.51953125, "completions/mean_terminated_length": 403.71429443359375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.615379095077515, "epoch": 0.5537726186411744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.446370135542494e-07, "loss": 0.0, "num_tokens": 930313726.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 2016.099609375, "completions/mean_terminated_length": 686.9166870117188, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 7.567462921142578, "epoch": 0.5541140320928645, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.441007442374973e-07, "loss": 0.0, "num_tokens": 931422241.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1152.0, "completions/mean_length": 2023.466796875, "completions/mean_terminated_length": 253.57144165039062, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.606849431991577, "epoch": 0.5544554455445545, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.435644833000138e-07, "loss": 0.0, "num_tokens": 932530848.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1183.0, "completions/mean_length": 2033.28125, "completions/mean_terminated_length": 540.7999877929688, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 7.621047616004944, "epoch": 0.5547968589962444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.430282315035007e-07, "loss": 0.0, "num_tokens": 933643024.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 2034.548828125, "completions/mean_terminated_length": 326.25, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.605542778968811, "epoch": 0.5551382724479345, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.42491989609647e-07, "loss": 0.0, "num_tokens": 934759561.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1993.859375, "completions/mean_terminated_length": 893.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.455562233924866, "epoch": 0.5554796858996245, "frac_reward_zero_std": 0.96875, "grad_norm": 0.040017884321080256, "learning_rate": 5.419557583801274e-07, "loss": -0.003, "num_tokens": 935862465.0, "reward": 0.01904296875, "reward_std": 0.011037028394639492, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.06243881583213806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 2030.078125, "completions/mean_terminated_length": 518.6666870117188, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 7.585964202880859, "epoch": 0.5558210993513144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.41419538576602e-07, "loss": 0.0, "num_tokens": 936988137.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1040.0, "completions/mean_length": 2039.041015625, "completions/mean_terminated_length": 519.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 7.590684294700623, "epoch": 0.5561625128030044, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.408833309607137e-07, "loss": 0.0, "num_tokens": 938106142.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1263.0, "completions/mean_length": 2031.8671875, "completions/mean_terminated_length": 671.3333740234375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 7.626255512237549, "epoch": 0.5565039262546945, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.403471362940891e-07, "loss": 0.0, "num_tokens": 939218154.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 2036.8125, "completions/mean_terminated_length": 138.6666717529297, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 7.5942078828811646, "epoch": 0.5568453397063844, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.398109553383359e-07, "loss": 0.0, "num_tokens": 940349194.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 2038.888671875, "completions/mean_terminated_length": 493.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 7.5906054973602295, "epoch": 0.5571867531580744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.392747888550423e-07, "loss": 0.0, "num_tokens": 941473361.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 2024.73046875, "completions/mean_terminated_length": 62.333335876464844, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.583216071128845, "epoch": 0.5575281666097645, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.387386376057759e-07, "loss": 0.0, "num_tokens": 942581127.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1056.0, "completions/mean_length": 2012.3203125, "completions/mean_terminated_length": 221.1999969482422, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.547732949256897, "epoch": 0.5578695800614544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.382025023520835e-07, "loss": 0.0, "num_tokens": 943691627.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1191.0, "completions/mean_length": 2036.37890625, "completions/mean_terminated_length": 560.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.564374923706055, "epoch": 0.5582109935131444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.376663838554878e-07, "loss": 0.0, "num_tokens": 944811245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2031.025390625, "completions/mean_terminated_length": 1634.1429443359375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.3931357860565186, "epoch": 0.5585524069648344, "frac_reward_zero_std": 0.96875, "grad_norm": 0.05690161932531674, "learning_rate": 5.371302828774886e-07, "loss": 0.0012, "num_tokens": 945932426.0, "reward": 0.0107421875, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0107421875, "rewards/tag_count_reward/std": 0.06358373910188675, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1621.0, "completions/mean_length": 1998.501953125, "completions/mean_terminated_length": 841.1904907226562, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 7.477278828620911, "epoch": 0.5588938204165244, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.365942001795606e-07, "loss": 0.0, "num_tokens": 947042411.0, "reward": 0.046875, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1226.0, "completions/mean_length": 1998.822265625, "completions/mean_terminated_length": 474.3125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.506972670555115, "epoch": 0.5592352338682144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.360581365231528e-07, "loss": 0.0, "num_tokens": 948147088.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.978515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 2017.21484375, "completions/mean_terminated_length": 615.0909423828125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.565404891967773, "epoch": 0.5595766473199044, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.355220926696863e-07, "loss": 0.0, "num_tokens": 949248430.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2044.009765625, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.601002335548401, "epoch": 0.5599180607715945, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.349860693805552e-07, "loss": 0.0, "num_tokens": 950368899.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2028.798828125, "completions/mean_terminated_length": 1579.857177734375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 7.3935911655426025, "epoch": 0.5602594742232844, "frac_reward_zero_std": 0.96875, "grad_norm": 0.04425691502369438, "learning_rate": 5.344500674171237e-07, "loss": 0.0024, "num_tokens": 951486268.0, "reward": 0.00927734375, "reward_std": 0.003149319440126419, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00927734375, "rewards/tag_count_reward/std": 0.05451139807701111, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1141.0, "completions/mean_length": 2017.123046875, "completions/mean_terminated_length": 467.1000061035156, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.594328284263611, "epoch": 0.5606008876749744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.339140875407257e-07, "loss": 0.0, "num_tokens": 952596491.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 2024.6796875, "completions/mean_terminated_length": 721.3333129882812, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 7.551305413246155, "epoch": 0.5609423011266644, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.33378130512664e-07, "loss": 0.0, "num_tokens": 953713335.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1110.0, "completions/mean_length": 2038.267578125, "completions/mean_terminated_length": 387.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.589643955230713, "epoch": 0.5612837145783544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.328421970942091e-07, "loss": 0.0, "num_tokens": 954839600.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 2021.353515625, "completions/mean_terminated_length": 532.1111450195312, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 7.621083974838257, "epoch": 0.5616251280300444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.323062880465972e-07, "loss": 0.0, "num_tokens": 955958949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1267.0, "completions/mean_length": 2020.384765625, "completions/mean_terminated_length": 477.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.572360634803772, "epoch": 0.5619665414817344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.31770404131031e-07, "loss": 0.0, "num_tokens": 957064026.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 2028.421875, "completions/mean_terminated_length": 377.3333435058594, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.627849817276001, "epoch": 0.5623079549334243, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.312345461086763e-07, "loss": 0.0, "num_tokens": 958175074.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.94140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1323.0, "completions/mean_length": 1964.5703125, "completions/mean_terminated_length": 624.1333618164062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.530476689338684, "epoch": 0.5626493683851144, "frac_reward_zero_std": 0.96875, "grad_norm": 0.01906768191707098, "learning_rate": 5.306987147406629e-07, "loss": 0.0011, "num_tokens": 959259110.0, "reward": 0.0458984375, "reward_std": 0.002668476663529873, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0146484375, "rewards/tag_count_reward/std": 0.08293935656547546, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 2017.8046875, "completions/mean_terminated_length": 115.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 7.599433660507202, "epoch": 0.5629907818368044, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.301629107880827e-07, "loss": 0.0, "num_tokens": 960365570.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1393.0, "completions/mean_length": 2023.853515625, "completions/mean_terminated_length": 674.3333129882812, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.581787705421448, "epoch": 0.5633321952884943, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.296271350119887e-07, "loss": 0.0, "num_tokens": 961475031.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 2021.953125, "completions/mean_terminated_length": 142.85714721679688, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.611051321029663, "epoch": 0.5636736087401844, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.290913881733931e-07, "loss": 0.0, "num_tokens": 962580143.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 2033.216796875, "completions/mean_terminated_length": 786.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 7.56494927406311, "epoch": 0.5640150221918744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.285556710332681e-07, "loss": 0.0, "num_tokens": 963701550.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1168.0, "completions/mean_length": 2039.41796875, "completions/mean_terminated_length": 583.3333740234375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 7.584797143936157, "epoch": 0.5643564356435643, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.280199843525429e-07, "loss": 0.0, "num_tokens": 964822564.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1034.0, "completions/mean_length": 2035.41015625, "completions/mean_terminated_length": 436.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 7.552468657493591, "epoch": 0.5646978490952543, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.27484328892104e-07, "loss": 0.0, "num_tokens": 965943270.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1472.0, "completions/mean_length": 2028.09765625, "completions/mean_terminated_length": 349.66668701171875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.63365626335144, "epoch": 0.5650392625469444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.26948705412793e-07, "loss": 0.0, "num_tokens": 967054680.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1090.0, "completions/mean_length": 2028.439453125, "completions/mean_terminated_length": 617.2857666015625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 7.663011789321899, "epoch": 0.5653806759986344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.264131146754067e-07, "loss": 0.0, "num_tokens": 968168665.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.94921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1985.51953125, "completions/mean_terminated_length": 817.6154174804688, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.459473967552185, "epoch": 0.5657220894503243, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0518902273329974, "learning_rate": 5.258775574406948e-07, "loss": -0.0026, "num_tokens": 969257971.0, "reward": 0.01513671875, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1256.0, "completions/mean_length": 2036.974609375, "completions/mean_terminated_length": 636.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.537240266799927, "epoch": 0.5660635029020143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.253420344693598e-07, "loss": 0.0, "num_tokens": 970382742.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 2023.39453125, "completions/mean_terminated_length": 788.2000122070312, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.6275413036346436, "epoch": 0.5664049163537044, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.248065465220552e-07, "loss": 0.0, "num_tokens": 971499152.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1242.0, "completions/mean_length": 2019.919921875, "completions/mean_terminated_length": 450.5555725097656, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.564443230628967, "epoch": 0.5667463298053943, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.242710943593852e-07, "loss": 0.0, "num_tokens": 972606839.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 2036.103515625, "completions/mean_terminated_length": 1032.8333740234375, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 7.587027192115784, "epoch": 0.5670877432570843, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.237356787419028e-07, "loss": 0.0, "num_tokens": 973731244.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1110.0, "completions/mean_length": 2033.31640625, "completions/mean_terminated_length": 544.4000244140625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.595638990402222, "epoch": 0.5674291567087744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.232003004301095e-07, "loss": 0.0, "num_tokens": 974847102.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1093.0, "completions/mean_length": 2029.2109375, "completions/mean_terminated_length": 444.66668701171875, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.595744371414185, "epoch": 0.5677705701604643, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.226649601844531e-07, "loss": 0.0, "num_tokens": 975967882.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2032.6171875, "completions/mean_terminated_length": 1672.952392578125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 7.372831583023071, "epoch": 0.5681119836121543, "frac_reward_zero_std": 0.96875, "grad_norm": 0.019114201034198607, "learning_rate": 5.221296587653282e-07, "loss": 0.0022, "num_tokens": 977095398.0, "reward": 0.00830078125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00830078125, "rewards/tag_count_reward/std": 0.047485124319791794, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1239.0, "completions/mean_length": 2031.390625, "completions/mean_terminated_length": 630.6666870117188, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 7.597764849662781, "epoch": 0.5684533970638443, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.215943969330735e-07, "loss": 0.0, "num_tokens": 978201550.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1196.0, "completions/mean_length": 2025.033203125, "completions/mean_terminated_length": 578.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.582270503044128, "epoch": 0.5687948105155343, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.210591754479718e-07, "loss": 0.0, "num_tokens": 979313007.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.978515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1175.0, "completions/mean_length": 2018.361328125, "completions/mean_terminated_length": 668.45458984375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 7.600923418998718, "epoch": 0.5691362239672243, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.205239950702488e-07, "loss": 0.0, "num_tokens": 980420792.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1331.0, "completions/mean_length": 1984.474609375, "completions/mean_terminated_length": 569.5909423828125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.479397177696228, "epoch": 0.5694776374189143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.199888565600715e-07, "loss": 0.0, "num_tokens": 981523131.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1124.0, "completions/mean_length": 2031.37890625, "completions/mean_terminated_length": 346.0, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 7.616657137870789, "epoch": 0.5698190508706042, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.194537606775473e-07, "loss": 0.0, "num_tokens": 982635165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1114.0, "completions/mean_length": 2033.1171875, "completions/mean_terminated_length": 524.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 7.616940855979919, "epoch": 0.5701604643222943, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.189187081827237e-07, "loss": 0.0, "num_tokens": 983755369.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 2041.916015625, "completions/mean_terminated_length": 490.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.580217242240906, "epoch": 0.5705018777739843, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.183836998355857e-07, "loss": 0.0, "num_tokens": 984878446.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 2020.421875, "completions/mean_terminated_length": 283.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.579593896865845, "epoch": 0.5708432912256743, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.178487363960563e-07, "loss": 0.0, "num_tokens": 985993078.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 2031.087890625, "completions/mean_terminated_length": 604.8333740234375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 7.5629342794418335, "epoch": 0.5711847046773643, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.173138186239943e-07, "loss": 0.0, "num_tokens": 987115907.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1299.0, "completions/mean_length": 2035.951171875, "completions/mean_terminated_length": 505.75, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 7.621360421180725, "epoch": 0.5715261181290543, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.167789472791942e-07, "loss": 0.0, "num_tokens": 988233722.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.978515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 2011.76171875, "completions/mean_terminated_length": 361.2727355957031, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.616816520690918, "epoch": 0.5718675315807443, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.162441231213834e-07, "loss": 0.0, "num_tokens": 989343232.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2040.453125, "completions/mean_terminated_length": 1854.800048828125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 7.414943814277649, "epoch": 0.5722089450324342, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.157093469102236e-07, "loss": 0.0, "num_tokens": 990469336.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 2040.943359375, "completions/mean_terminated_length": 241.5, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 7.629729151725769, "epoch": 0.5725503584841243, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.151746194053077e-07, "loss": 0.0, "num_tokens": 991589035.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1071.0, "completions/mean_length": 2032.90234375, "completions/mean_terminated_length": 502.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 7.568242311477661, "epoch": 0.5728917719358143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.146399413661595e-07, "loss": 0.0, "num_tokens": 992704137.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.92578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2026.654296875, "completions/mean_terminated_length": 1760.394775390625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 7.216583967208862, "epoch": 0.5732331853875042, "frac_reward_zero_std": 0.96875, "grad_norm": 0.627568729446992, "learning_rate": 5.141053135522324e-07, "loss": 0.0008, "num_tokens": 993826872.0, "reward": 0.00146484375, "reward_std": 0.003149319440126419, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00146484375, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 2036.88671875, "completions/mean_terminated_length": 625.5, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "entropy": 7.65874171257019, "epoch": 0.5735745988391943, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.13570736722909e-07, "loss": 0.0, "num_tokens": 994944190.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 2027.857421875, "completions/mean_terminated_length": 758.875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 7.567225217819214, "epoch": 0.5739160122908843, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.130362116374989e-07, "loss": 0.0, "num_tokens": 996066773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1638.0, "completions/mean_length": 2023.080078125, "completions/mean_terminated_length": 453.125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 7.607573628425598, "epoch": 0.5742574257425742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.125017390552383e-07, "loss": 0.0, "num_tokens": 997177358.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1148.0, "completions/mean_length": 1999.21875, "completions/mean_terminated_length": 799.2000122070312, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 7.511411666870117, "epoch": 0.5745988391942642, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.11967319735289e-07, "loss": 0.0, "num_tokens": 998276622.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.978515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1087.0, "completions/mean_length": 2012.794921875, "completions/mean_terminated_length": 409.3636474609375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.537710785865784, "epoch": 0.5749402526459543, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.114329544367374e-07, "loss": 0.0, "num_tokens": 999379109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 2028.732421875, "completions/mean_terminated_length": 75.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.630548357963562, "epoch": 0.5752816660976442, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.108986439185923e-07, "loss": 0.0, "num_tokens": 1000491532.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.978515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1247.0, "completions/mean_length": 2017.498046875, "completions/mean_terminated_length": 628.2727661132812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 7.60116720199585, "epoch": 0.5756230795493342, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.103643889397858e-07, "loss": 0.0, "num_tokens": 1001600331.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.974609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1421.0, "completions/mean_length": 2011.50390625, "completions/mean_terminated_length": 610.6154174804688, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 7.651395916938782, "epoch": 0.5759644930010243, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.098301902591703e-07, "loss": 0.0, "num_tokens": 1002709661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.919921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1965.935546875, "completions/mean_terminated_length": 1023.195068359375, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.4165695905685425, "epoch": 0.5763059064527143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.092960486355183e-07, "loss": 0.0, "num_tokens": 1003808764.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1160.0, "completions/mean_length": 2026.634765625, "completions/mean_terminated_length": 224.83334350585938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.639818072319031, "epoch": 0.5766473199044042, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.087619648275217e-07, "loss": 0.0, "num_tokens": 1004927729.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1847.0, "completions/mean_length": 2021.712890625, "completions/mean_terminated_length": 702.1000366210938, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.604283928871155, "epoch": 0.5769887333560942, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.082279395937903e-07, "loss": 0.0, "num_tokens": 1006035310.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1174.0, "completions/mean_length": 2031.083984375, "completions/mean_terminated_length": 604.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 7.600080370903015, "epoch": 0.5773301468077843, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.076939736928497e-07, "loss": 0.0, "num_tokens": 1007151257.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 2021.1875, "completions/mean_terminated_length": 522.6666870117188, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.644067287445068, "epoch": 0.5776715602594742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.071600678831427e-07, "loss": 0.0, "num_tokens": 1008256761.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1475.0, "completions/mean_length": 2025.224609375, "completions/mean_terminated_length": 382.14288330078125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.619562149047852, "epoch": 0.5780129737111642, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.066262229230254e-07, "loss": 0.0, "num_tokens": 1009367036.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 2027.5390625, "completions/mean_terminated_length": 302.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 7.665468811988831, "epoch": 0.5783543871628543, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.060924395707685e-07, "loss": 0.0, "num_tokens": 1010490240.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1151.0, "completions/mean_length": 2042.251953125, "completions/mean_terminated_length": 576.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.660264253616333, "epoch": 0.5786958006145442, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.055587185845545e-07, "loss": 0.0, "num_tokens": 1011613969.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 2018.435546875, "completions/mean_terminated_length": 155.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.618499517440796, "epoch": 0.5790372140662342, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.05025060722478e-07, "loss": 0.0, "num_tokens": 1012718064.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1189.0, "completions/mean_length": 2030.75390625, "completions/mean_terminated_length": 576.3333740234375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.627386569976807, "epoch": 0.5793786275179242, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.044914667425427e-07, "loss": 0.0, "num_tokens": 1013842514.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2030.357421875, "completions/mean_terminated_length": 1596.3499755859375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 7.412785530090332, "epoch": 0.5797200409696142, "frac_reward_zero_std": 0.96875, "grad_norm": 0.06852682095060868, "learning_rate": 5.039579374026633e-07, "loss": 0.0033, "num_tokens": 1014961721.0, "reward": 0.01123046875, "reward_std": 0.004002714995294809, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01123046875, "rewards/tag_count_reward/std": 0.06632548570632935, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1883.0, "completions/mean_length": 2026.76953125, "completions/mean_terminated_length": 689.25, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.6228004693984985, "epoch": 0.5800614544213042, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.034244734606612e-07, "loss": 0.0, "num_tokens": 1016067715.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1212.0, "completions/mean_length": 2015.076171875, "completions/mean_terminated_length": 362.3000183105469, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 7.643041253089905, "epoch": 0.5804028678729942, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.028910756742655e-07, "loss": 0.0, "num_tokens": 1017188042.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 2021.44921875, "completions/mean_terminated_length": 348.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.649715065956116, "epoch": 0.5807442813246841, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.023577448011116e-07, "loss": 0.0, "num_tokens": 1018298720.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1188.0, "completions/mean_length": 2010.96875, "completions/mean_terminated_length": 468.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 7.666531085968018, "epoch": 0.5810856947763742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.018244815987395e-07, "loss": 0.0, "num_tokens": 1019398720.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1329.0, "completions/mean_length": 2034.705078125, "completions/mean_terminated_length": 686.6000366210938, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 7.604381442070007, "epoch": 0.5814271082280642, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.012912868245927e-07, "loss": 0.0, "num_tokens": 1020516969.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 2023.33203125, "completions/mean_terminated_length": 785.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.5566065311431885, "epoch": 0.5817685216797542, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.007581612360185e-07, "loss": 0.0, "num_tokens": 1021625843.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1184.0, "completions/mean_length": 2021.291015625, "completions/mean_terminated_length": 528.5555419921875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 7.60968804359436, "epoch": 0.5821099351314442, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.002251055902651e-07, "loss": 0.0, "num_tokens": 1022736328.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1398.0, "completions/mean_length": 2011.984375, "completions/mean_terminated_length": 511.3333435058594, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 7.6270833015441895, "epoch": 0.5824513485831342, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.996921206444818e-07, "loss": 0.0, "num_tokens": 1023846176.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1161.0, "completions/mean_length": 2027.5625, "completions/mean_terminated_length": 553.1428833007812, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.619826674461365, "epoch": 0.5827927620348242, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.991592071557171e-07, "loss": 0.0, "num_tokens": 1024963488.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1296.0, "completions/mean_length": 2017.0234375, "completions/mean_terminated_length": 462.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.6417518854141235, "epoch": 0.5831341754865141, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.986263658809185e-07, "loss": 0.0, "num_tokens": 1026078428.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 2021.49609375, "completions/mean_terminated_length": 540.2222290039062, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 7.695429801940918, "epoch": 0.5834755889382042, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.980935975769303e-07, "loss": 0.0, "num_tokens": 1027196826.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 2033.35546875, "completions/mean_terminated_length": 173.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.602167010307312, "epoch": 0.5838170023898942, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.975609030004938e-07, "loss": 0.0, "num_tokens": 1028314656.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2025.951171875, "completions/mean_terminated_length": 1453.8421630859375, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "entropy": 7.4809746742248535, "epoch": 0.5841584158415841, "frac_reward_zero_std": 0.96875, "grad_norm": 0.11675811724921531, "learning_rate": 4.97028282908245e-07, "loss": 0.0054, "num_tokens": 1029428935.0, "reward": 0.01318359375, "reward_std": 0.003739949781447649, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01318359375, "rewards/tag_count_reward/std": 0.07628239691257477, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1504.0, "completions/mean_length": 2022.744140625, "completions/mean_terminated_length": 611.2222290039062, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.630227565765381, "epoch": 0.5844998292932742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.964957380567146e-07, "loss": 0.0, "num_tokens": 1030535188.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2000.07421875, "completions/mean_terminated_length": 879.5238037109375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.5119630098342896, "epoch": 0.5848412427449642, "frac_reward_zero_std": 0.96875, "grad_norm": 0.10046762625339895, "learning_rate": 4.959632692023262e-07, "loss": 0.0044, "num_tokens": 1031643530.0, "reward": 0.01513671875, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 1713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.978515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 2015.74609375, "completions/mean_terminated_length": 546.727294921875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.636378884315491, "epoch": 0.5851826561966541, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.954308771013954e-07, "loss": 0.0, "num_tokens": 1032754104.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.978515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 2014.080078125, "completions/mean_terminated_length": 469.18182373046875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.62941575050354, "epoch": 0.5855240696483441, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.948985625101287e-07, "loss": 0.0, "num_tokens": 1033857697.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.970703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 1997.091796875, "completions/mean_terminated_length": 310.3333435058594, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.6190032958984375, "epoch": 0.5858654831000342, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.943663261846227e-07, "loss": 0.0, "num_tokens": 1034955792.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 2015.1875, "completions/mean_terminated_length": 181.3333282470703, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 7.6334593296051025, "epoch": 0.5862068965517241, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.938341688808628e-07, "loss": 0.0, "num_tokens": 1036065536.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 2033.40625, "completions/mean_terminated_length": 180.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.655667185783386, "epoch": 0.5865483100034141, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.933020913547223e-07, "loss": 0.0, "num_tokens": 1037181184.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1353.0, "completions/mean_length": 2018.732421875, "completions/mean_terminated_length": 549.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.674666166305542, "epoch": 0.5868897234551042, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.927700943619609e-07, "loss": 0.0, "num_tokens": 1038289271.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 2001.001953125, "completions/mean_terminated_length": 711.1666870117188, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 7.639730930328369, "epoch": 0.5872311369067942, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.922381786582241e-07, "loss": 0.0, "num_tokens": 1039386664.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 2031.619140625, "completions/mean_terminated_length": 370.6000061035156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.59669291973114, "epoch": 0.5875725503584841, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.917063449990416e-07, "loss": 0.0, "num_tokens": 1040499045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2027.861328125, "completions/mean_terminated_length": 1557.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.534218430519104, "epoch": 0.5879139638101741, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.91174594139827e-07, "loss": 0.0, "num_tokens": 1041616510.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1590.0, "completions/mean_length": 2029.65234375, "completions/mean_terminated_length": 873.75, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.690843939781189, "epoch": 0.5882553772618642, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.906429268358762e-07, "loss": 0.0, "num_tokens": 1042743132.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.923828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1993.025390625, "completions/mean_terminated_length": 1326.2821044921875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.2734339237213135, "epoch": 0.5885967907135541, "frac_reward_zero_std": 0.9375, "grad_norm": 0.23043689650965657, "learning_rate": 4.901113438423664e-07, "loss": 0.0115, "num_tokens": 1043846937.0, "reward": 0.025390625, "reward_std": 0.007152034435421228, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.025390625, "rewards/tag_count_reward/std": 0.1029878631234169, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1272.0, "completions/mean_length": 2016.853515625, "completions/mean_terminated_length": 453.3000183105469, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 7.6691224575042725, "epoch": 0.5889382041652441, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.895798459143548e-07, "loss": 0.0, "num_tokens": 1044950110.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1080.0, "completions/mean_length": 2020.931640625, "completions/mean_terminated_length": 315.625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.64157247543335, "epoch": 0.5892796176169341, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.890484338067781e-07, "loss": 0.0, "num_tokens": 1046062299.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1079.0, "completions/mean_length": 2018.708984375, "completions/mean_terminated_length": 548.2999877929688, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 7.67125391960144, "epoch": 0.5896210310686241, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.885171082744506e-07, "loss": 0.0, "num_tokens": 1047175526.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1042.0, "completions/mean_length": 2023.40234375, "completions/mean_terminated_length": 473.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.6658923625946045, "epoch": 0.5899624445203141, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.879858700720645e-07, "loss": 0.0, "num_tokens": 1048280644.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2023.5234375, "completions/mean_terminated_length": 1525.8333740234375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 7.3706642389297485, "epoch": 0.5903038579720041, "frac_reward_zero_std": 0.96875, "grad_norm": 0.15090088955807895, "learning_rate": 4.874547199541871e-07, "loss": 0.0051, "num_tokens": 1049405680.0, "reward": 0.00830078125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00830078125, "rewards/tag_count_reward/std": 0.047485124319791794, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1198.0, "completions/mean_length": 2021.1796875, "completions/mean_terminated_length": 331.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.669951677322388, "epoch": 0.590645271423694, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.869236586752612e-07, "loss": 0.0, "num_tokens": 1050519036.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1236.0, "completions/mean_length": 2023.09765625, "completions/mean_terminated_length": 454.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.598696708679199, "epoch": 0.5909866848753841, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.863926869896029e-07, "loss": 0.0, "num_tokens": 1051630606.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 2023.0703125, "completions/mean_terminated_length": 452.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.641067147254944, "epoch": 0.5913280983270741, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.858618056514016e-07, "loss": 0.0, "num_tokens": 1052746178.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 2028.966796875, "completions/mean_terminated_length": 655.857177734375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 7.667669653892517, "epoch": 0.591669511778764, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.853310154147176e-07, "loss": 0.0, "num_tokens": 1053860417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1073.0, "completions/mean_length": 2021.998046875, "completions/mean_terminated_length": 383.875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.639047622680664, "epoch": 0.5920109252304541, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.848003170334826e-07, "loss": 0.0, "num_tokens": 1054974784.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1138.0, "completions/mean_length": 2029.76171875, "completions/mean_terminated_length": 491.66668701171875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 7.663140058517456, "epoch": 0.5923523386821441, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.842697112614972e-07, "loss": 0.0, "num_tokens": 1056085270.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 2017.86328125, "completions/mean_terminated_length": 333.5555725097656, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.604213237762451, "epoch": 0.592693752133834, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.837391988524313e-07, "loss": 0.0, "num_tokens": 1057192352.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 2033.962890625, "completions/mean_terminated_length": 610.6000366210938, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.656649947166443, "epoch": 0.593035165585524, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.83208780559821e-07, "loss": 0.0, "num_tokens": 1058304157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1177.0, "completions/mean_length": 2040.2734375, "completions/mean_terminated_length": 729.3333740234375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.660610675811768, "epoch": 0.5933765790372141, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.826784571370698e-07, "loss": 0.0, "num_tokens": 1059423545.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1042.0, "completions/mean_length": 2036.197265625, "completions/mean_terminated_length": 537.25, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 7.646557092666626, "epoch": 0.5937179924889041, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.821482293374457e-07, "loss": 0.0, "num_tokens": 1060540686.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1175.0, "completions/mean_length": 2034.099609375, "completions/mean_terminated_length": 624.6000366210938, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 7.650731921195984, "epoch": 0.594059405940594, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.816180979140815e-07, "loss": 0.0, "num_tokens": 1061660145.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.978515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1413.0, "completions/mean_length": 2018.552734375, "completions/mean_terminated_length": 677.3636474609375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 7.6419984102249146, "epoch": 0.5944008193922841, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.810880636199724e-07, "loss": 0.0, "num_tokens": 1062770492.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 2034.787109375, "completions/mean_terminated_length": 356.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.634580969810486, "epoch": 0.5947422328439741, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.805581272079764e-07, "loss": 0.0, "num_tokens": 1063891391.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1469.0, "completions/mean_length": 2023.890625, "completions/mean_terminated_length": 813.6000366210938, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 7.591762900352478, "epoch": 0.595083646295664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.800282894308116e-07, "loss": 0.0, "num_tokens": 1064994503.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1931.0, "completions/mean_length": 1976.466796875, "completions/mean_terminated_length": 521.9583740234375, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.563186168670654, "epoch": 0.595425059747354, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.794985510410569e-07, "loss": 0.0, "num_tokens": 1066083430.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2034.583984375, "completions/mean_terminated_length": 1704.550048828125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.520256400108337, "epoch": 0.5957664731990441, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.789689127911498e-07, "loss": 0.0, "num_tokens": 1067205025.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 2026.443359375, "completions/mean_terminated_length": 471.2857360839844, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.691043019294739, "epoch": 0.596107886650734, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.784393754333849e-07, "loss": 0.0, "num_tokens": 1068318676.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 2028.98828125, "completions/mean_terminated_length": 831.25, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 7.660354018211365, "epoch": 0.596449300102424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.779099397199142e-07, "loss": 0.0, "num_tokens": 1069436782.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 2036.673828125, "completions/mean_terminated_length": 115.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.664470672607422, "epoch": 0.5967907135541141, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.77380606402745e-07, "loss": 0.0, "num_tokens": 1070555511.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1178.0, "completions/mean_length": 2028.798828125, "completions/mean_terminated_length": 643.5714721679688, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.684087872505188, "epoch": 0.597132127005804, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.768513762337396e-07, "loss": 0.0, "num_tokens": 1071667392.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 2031.775390625, "completions/mean_terminated_length": 386.6000061035156, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 7.644893169403076, "epoch": 0.597473540457494, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.763222499646129e-07, "loss": 0.0, "num_tokens": 1072780557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1281.0, "completions/mean_length": 2023.1875, "completions/mean_terminated_length": 636.4444580078125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 7.677917003631592, "epoch": 0.597814953909184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.757932283469334e-07, "loss": 0.0, "num_tokens": 1073889037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.978515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1098.0, "completions/mean_length": 2011.58203125, "completions/mean_terminated_length": 352.9090881347656, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.637798428535461, "epoch": 0.598156367360874, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.7526431213211973e-07, "loss": 0.0, "num_tokens": 1074990503.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.947265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 1974.86328125, "completions/mean_terminated_length": 661.1111450195312, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.508590936660767, "epoch": 0.598497780812564, "frac_reward_zero_std": 0.96875, "grad_norm": 0.059645686421233184, "learning_rate": 4.7473550207144174e-07, "loss": 0.0035, "num_tokens": 1076082721.0, "reward": 0.0146484375, "reward_std": 0.002668476663529873, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0146484375, "rewards/tag_count_reward/std": 0.08293935656547546, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1152.0, "completions/mean_length": 2038.423828125, "completions/mean_terminated_length": 413.66668701171875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.650040149688721, "epoch": 0.598839194264254, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.74206798916018e-07, "loss": 0.0, "num_tokens": 1077197802.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 1981.6640625, "completions/mean_terminated_length": 504.18182373046875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 7.5498950481414795, "epoch": 0.5991806077159441, "frac_reward_zero_std": 0.96875, "grad_norm": 0.017883543981799862, "learning_rate": 4.7367820341681563e-07, "loss": 0.0005, "num_tokens": 1078295246.0, "reward": 0.017578125, "reward_std": 0.0078125, "rewards/accuracy_reward/mean": 0.001953125, "rewards/accuracy_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 2023.07421875, "completions/mean_terminated_length": 224.85714721679688, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.675302147865295, "epoch": 0.599522021167634, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.731497163246482e-07, "loss": 0.0, "num_tokens": 1079411844.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1192.0, "completions/mean_length": 2026.943359375, "completions/mean_terminated_length": 507.857177734375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 7.6389970779418945, "epoch": 0.599863434619324, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.7262133839017624e-07, "loss": 0.0, "num_tokens": 1080533911.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 2030.7265625, "completions/mean_terminated_length": 279.20001220703125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 7.641716718673706, "epoch": 0.600204848071014, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.720930703639041e-07, "loss": 0.0, "num_tokens": 1081651771.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 2009.986328125, "completions/mean_terminated_length": 426.0833435058594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.656459927558899, "epoch": 0.600546261522704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.7156491299618105e-07, "loss": 0.0, "num_tokens": 1082750324.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.974609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1417.0, "completions/mean_length": 2010.77734375, "completions/mean_terminated_length": 582.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 7.629309773445129, "epoch": 0.600887674974394, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.710368670371985e-07, "loss": 0.0, "num_tokens": 1083860274.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1747.0, "completions/mean_length": 2028.2109375, "completions/mean_terminated_length": 781.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 7.631670236587524, "epoch": 0.601229088426084, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.705089332369901e-07, "loss": 0.0, "num_tokens": 1084974638.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1083.0, "completions/mean_length": 2022.130859375, "completions/mean_terminated_length": 392.375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 7.699518799781799, "epoch": 0.6015705018777739, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.699811123454295e-07, "loss": 0.0, "num_tokens": 1086084961.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 2023.455078125, "completions/mean_terminated_length": 651.6666870117188, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.634135842323303, "epoch": 0.601911915329464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.69453405112231e-07, "loss": 0.0, "num_tokens": 1087191642.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1258.0, "completions/mean_length": 2025.080078125, "completions/mean_terminated_length": 581.125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.650080680847168, "epoch": 0.602253328781154, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.689258122869463e-07, "loss": 0.0, "num_tokens": 1088310211.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1150.0, "completions/mean_length": 2025.65234375, "completions/mean_terminated_length": 617.75, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.6531113386154175, "epoch": 0.6025947422328439, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.683983346189656e-07, "loss": 0.0, "num_tokens": 1089424273.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 2016.423828125, "completions/mean_terminated_length": 251.6666717529297, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.593531370162964, "epoch": 0.602936155684534, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.6787097285751487e-07, "loss": 0.0, "num_tokens": 1090531354.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1928.0, "completions/mean_length": 2043.17578125, "completions/mean_terminated_length": 1224.666748046875, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "entropy": 7.6645296812057495, "epoch": 0.603277569136224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.673437277516559e-07, "loss": 0.0, "num_tokens": 1091656244.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1142.0, "completions/mean_length": 1990.505859375, "completions/mean_terminated_length": 709.95458984375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 7.513449668884277, "epoch": 0.6036189825879139, "frac_reward_zero_std": 0.96875, "grad_norm": 0.020090043847993726, "learning_rate": 4.668166000502842e-07, "loss": 0.0011, "num_tokens": 1092753447.0, "reward": 0.0390625, "reward_std": 0.013975424692034721, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15143637359142303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 2028.806640625, "completions/mean_terminated_length": 410.16668701171875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 7.562949299812317, "epoch": 0.6039603960396039, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.6628959050212936e-07, "loss": 0.0, "num_tokens": 1093864180.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1199.0, "completions/mean_length": 2019.427734375, "completions/mean_terminated_length": 585.1000366210938, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 7.626249432563782, "epoch": 0.604301809491294, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.657626998557522e-07, "loss": 0.0, "num_tokens": 1094971231.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1868.0, "completions/mean_length": 1989.94921875, "completions/mean_terminated_length": 809.5833740234375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.472697615623474, "epoch": 0.604643222942984, "frac_reward_zero_std": 0.96875, "grad_norm": 0.02115000008733776, "learning_rate": 4.6523592885954553e-07, "loss": -0.0004, "num_tokens": 1096071845.0, "reward": 0.04296875, "reward_std": 0.010673906654119492, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.16324250400066376, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1056.0, "completions/mean_length": 2022.783203125, "completions/mean_terminated_length": 434.125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.624919772148132, "epoch": 0.6049846363946739, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.6470927826173155e-07, "loss": 0.0, "num_tokens": 1097183702.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 2037.294921875, "completions/mean_terminated_length": 677.75, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 7.624131917953491, "epoch": 0.605326049846364, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.641827488103619e-07, "loss": 0.0, "num_tokens": 1098307549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1288.0, "completions/mean_length": 2018.294921875, "completions/mean_terminated_length": 527.1000366210938, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 7.6427202224731445, "epoch": 0.605667463298054, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.6365634125331566e-07, "loss": 0.0, "num_tokens": 1099419268.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1043.0, "completions/mean_length": 2030.57421875, "completions/mean_terminated_length": 561.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.594877600669861, "epoch": 0.6060088767497439, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.631300563382994e-07, "loss": 0.0, "num_tokens": 1100539018.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1269.0, "completions/mean_length": 2009.32421875, "completions/mean_terminated_length": 397.8333435058594, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.640562295913696, "epoch": 0.6063502902014339, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.626038948128448e-07, "loss": 0.0, "num_tokens": 1101644960.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 2038.79296875, "completions/mean_terminated_length": 476.66668701171875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 7.658280611038208, "epoch": 0.606691703653124, "frac_reward_zero_std": 0.96875, "grad_norm": 0.6820205207558061, "learning_rate": 4.6207785742430895e-07, "loss": 0.0, "num_tokens": 1102760934.0, "reward": 0.00048828125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00048828125, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1101.0, "completions/mean_length": 2027.3828125, "completions/mean_terminated_length": 728.5, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 7.686988592147827, "epoch": 0.6070331171048139, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.615519449198719e-07, "loss": 0.0, "num_tokens": 1103875402.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1095.0, "completions/mean_length": 2029.564453125, "completions/mean_terminated_length": 474.8333435058594, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 7.722477436065674, "epoch": 0.6073745305565039, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.6102615804653724e-07, "loss": 0.0, "num_tokens": 1104989019.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1215.0, "completions/mean_length": 2043.701171875, "completions/mean_terminated_length": 947.5, "completions/min_length": 680.0, "completions/min_terminated_length": 680.0, "entropy": 7.683214545249939, "epoch": 0.607715944008194, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.6050049755112906e-07, "loss": 0.0, "num_tokens": 1106117714.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 1986.537109375, "completions/mean_terminated_length": 391.7368469238281, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 7.628375768661499, "epoch": 0.6080573574598839, "frac_reward_zero_std": 0.96875, "grad_norm": 0.022212536531081115, "learning_rate": 4.599749641802928e-07, "loss": 0.0005, "num_tokens": 1107210405.0, "reward": 0.01953125, "reward_std": 0.010673906654119492, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.06243881583213806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1293.0, "completions/mean_length": 2021.26953125, "completions/mean_terminated_length": 337.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.68739926815033, "epoch": 0.6083987709115739, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.5944955868049276e-07, "loss": 0.0, "num_tokens": 1108315871.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1158.0, "completions/mean_length": 2033.20703125, "completions/mean_terminated_length": 533.2000122070312, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.693054914474487, "epoch": 0.6087401843632639, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.5892428179801213e-07, "loss": 0.0, "num_tokens": 1109434953.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 2040.171875, "completions/mean_terminated_length": 44.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.664155960083008, "epoch": 0.6090815978149539, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.5839913427895083e-07, "loss": 0.0, "num_tokens": 1110566529.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 2026.9375, "completions/mean_terminated_length": 250.6666717529297, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.639796137809753, "epoch": 0.6094230112666439, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.578741168692256e-07, "loss": 0.0, "num_tokens": 1111681985.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 2019.85546875, "completions/mean_terminated_length": 246.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 7.626237154006958, "epoch": 0.6097644247183339, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.5734923031456783e-07, "loss": 0.0, "num_tokens": 1112794823.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 2036.54296875, "completions/mean_terminated_length": 92.66667175292969, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.627734422683716, "epoch": 0.610105838170024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.568244753605237e-07, "loss": 0.0, "num_tokens": 1113918029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1197.0, "completions/mean_length": 2030.30078125, "completions/mean_terminated_length": 537.6666870117188, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 7.658162474632263, "epoch": 0.6104472516217139, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.5629985275245174e-07, "loss": 0.0, "num_tokens": 1115036039.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 2031.177734375, "completions/mean_terminated_length": 325.3999938964844, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.5653005838394165, "epoch": 0.6107886650734039, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.557753632355231e-07, "loss": 0.0, "num_tokens": 1116152178.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1106.0, "completions/mean_length": 2020.892578125, "completions/mean_terminated_length": 313.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.561338543891907, "epoch": 0.6111300785250939, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.5525100755471934e-07, "loss": 0.0, "num_tokens": 1117270907.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 2020.3046875, "completions/mean_terminated_length": 275.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.5719674825668335, "epoch": 0.6114714919767839, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.5472678645483264e-07, "loss": 0.0, "num_tokens": 1118381367.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1359.0, "completions/mean_length": 2028.201171875, "completions/mean_terminated_length": 599.857177734375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 7.573107123374939, "epoch": 0.6118129054284739, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.5420270068046315e-07, "loss": 0.0, "num_tokens": 1119488542.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2025.818359375, "completions/mean_terminated_length": 1507.1905517578125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 7.352357268333435, "epoch": 0.6121543188801639, "frac_reward_zero_std": 0.96875, "grad_norm": 0.06903826214947759, "learning_rate": 4.536787509760196e-07, "loss": 0.0037, "num_tokens": 1120604865.0, "reward": 0.01171875, "reward_std": 0.004034357611089945, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01171875, "rewards/tag_count_reward/std": 0.06895484030246735, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1933.0, "completions/mean_length": 2038.4296875, "completions/mean_terminated_length": 823.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 7.589882254600525, "epoch": 0.6124957323318538, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.531549380857168e-07, "loss": 0.0, "num_tokens": 1121729165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 2036.3671875, "completions/mean_terminated_length": 559.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.524744629859924, "epoch": 0.6128371457835439, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.5263126275357575e-07, "loss": 0.0, "num_tokens": 1122860393.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 2037.935546875, "completions/mean_terminated_length": 330.3333435058594, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.529447317123413, "epoch": 0.6131785592352339, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.521077257234217e-07, "loss": 0.0, "num_tokens": 1123985960.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1761.0, "completions/mean_length": 2028.15625, "completions/mean_terminated_length": 596.5714721679688, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.545132279396057, "epoch": 0.6135199726869238, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.515843277388839e-07, "loss": 0.0, "num_tokens": 1125101592.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 2044.16796875, "completions/mean_terminated_length": 86.0, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 7.489269018173218, "epoch": 0.6138613861386139, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.5106106954339327e-07, "loss": 0.0, "num_tokens": 1126239038.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 2026.24609375, "completions/mean_terminated_length": 456.857177734375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.492049694061279, "epoch": 0.6142027995903039, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.5053795188018316e-07, "loss": 0.0, "num_tokens": 1127353228.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 2012.97265625, "completions/mean_terminated_length": 55.33333206176758, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.4862672090530396, "epoch": 0.6145442130419938, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.5001497549228653e-07, "loss": 0.0, "num_tokens": 1128464782.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1125.0, "completions/mean_length": 2030.580078125, "completions/mean_terminated_length": 264.20001220703125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.50241219997406, "epoch": 0.6148856264936838, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.494921411225363e-07, "loss": 0.0, "num_tokens": 1129573687.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 2040.697265625, "completions/mean_terminated_length": 178.5, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 7.50924551486969, "epoch": 0.6152270399453739, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.4896944951356295e-07, "loss": 0.0, "num_tokens": 1130692284.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1134.0, "completions/mean_length": 2040.9609375, "completions/mean_terminated_length": 846.6666870117188, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "entropy": 7.5282933712005615, "epoch": 0.6155684533970639, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.484469014077953e-07, "loss": 0.0, "num_tokens": 1131807800.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 7.4912109375, "epoch": 0.6159098668487538, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.479244975474569e-07, "loss": 0.0, "num_tokens": 1132939064.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 2032.546875, "completions/mean_terminated_length": 70.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.501731872558594, "epoch": 0.6162512803004438, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.4740223867456737e-07, "loss": 0.0, "num_tokens": 1134053136.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 2028.30859375, "completions/mean_terminated_length": 367.66668701171875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 7.462884783744812, "epoch": 0.6165926937521339, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.4688012553094033e-07, "loss": 0.0, "num_tokens": 1135173134.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2005.181640625, "completions/mean_terminated_length": 1004.047607421875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.26753830909729, "epoch": 0.6169341072038238, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.463581588581823e-07, "loss": 0.0, "num_tokens": 1136284507.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 2029.49609375, "completions/mean_terminated_length": 153.1999969482422, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.441762447357178, "epoch": 0.6172755206555138, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.4583633939769127e-07, "loss": 0.0, "num_tokens": 1137406425.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1125.0, "completions/mean_length": 2029.09765625, "completions/mean_terminated_length": 435.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.487030625343323, "epoch": 0.6176169341072039, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.453146678906571e-07, "loss": 0.0, "num_tokens": 1138517691.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1064.0, "completions/mean_length": 2042.666015625, "completions/mean_terminated_length": 682.5, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "entropy": 7.406032919883728, "epoch": 0.6179583475588938, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.4479314507805856e-07, "loss": 0.0, "num_tokens": 1139641376.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 2028.53515625, "completions/mean_terminated_length": 624.2857666015625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 7.452839970588684, "epoch": 0.6182997610105838, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.4427177170066387e-07, "loss": 0.0, "num_tokens": 1140756946.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 7.447265625, "epoch": 0.6186411744622738, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.4375054849902847e-07, "loss": 0.0, "num_tokens": 1141876818.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 2038.359375, "completions/mean_terminated_length": 402.66668701171875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 7.502947568893433, "epoch": 0.6189825879139638, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.4322947621349517e-07, "loss": 0.0, "num_tokens": 1142993146.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 2029.72265625, "completions/mean_terminated_length": 176.40000915527344, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.548621773719788, "epoch": 0.6193240013656538, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.427085555841915e-07, "loss": 0.0, "num_tokens": 1144108876.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1180.0, "completions/mean_length": 2038.6015625, "completions/mean_terminated_length": 444.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.424819350242615, "epoch": 0.6196654148173438, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.4218778735103045e-07, "loss": 0.0, "num_tokens": 1145231680.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 2028.4609375, "completions/mean_terminated_length": 380.66668701171875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.48325252532959, "epoch": 0.6200068282690337, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.416671722537081e-07, "loss": 0.0, "num_tokens": 1146344668.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1999.755859375, "completions/mean_terminated_length": 747.9473876953125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 7.36272406578064, "epoch": 0.6203482417207238, "frac_reward_zero_std": 0.96875, "grad_norm": 0.028427006036537847, "learning_rate": 4.411467110317031e-07, "loss": 0.0046, "num_tokens": 1147453183.0, "reward": 0.01611328125, "reward_std": 0.008778205141425133, "rewards/accuracy_reward/mean": 0.001953125, "rewards/accuracy_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01416015625, "rewards/tag_count_reward/std": 0.08078429847955704, "step": 1817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1142.0, "completions/mean_length": 2017.990234375, "completions/mean_terminated_length": 340.77777099609375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.422549843788147, "epoch": 0.6206896551724138, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.4062640442427534e-07, "loss": 0.0, "num_tokens": 1148570714.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 2039.328125, "completions/mean_terminated_length": 568.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 7.4812517166137695, "epoch": 0.6210310686241038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.401062531704658e-07, "loss": 0.0, "num_tokens": 1149696626.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1257.0, "completions/mean_length": 2026.69921875, "completions/mean_terminated_length": 490.0000305175781, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.485160946846008, "epoch": 0.6213724820757938, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.3958625800909365e-07, "loss": 0.0, "num_tokens": 1150810520.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 2024.611328125, "completions/mean_terminated_length": 337.2857360839844, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.473076343536377, "epoch": 0.6217138955274838, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.3906641967875747e-07, "loss": 0.0, "num_tokens": 1151932977.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 2039.5703125, "completions/mean_terminated_length": 609.3333740234375, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 7.420729637145996, "epoch": 0.6220553089791738, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.3854673891783224e-07, "loss": 0.0, "num_tokens": 1153057685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1071.0, "completions/mean_length": 2032.154296875, "completions/mean_terminated_length": 425.3999938964844, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 7.446094393730164, "epoch": 0.6223967224308637, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.3802721646446985e-07, "loss": 0.0, "num_tokens": 1154177620.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1195.0, "completions/mean_length": 2037.369140625, "completions/mean_terminated_length": 687.25, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 7.474501013755798, "epoch": 0.6227381358825538, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.375078530565967e-07, "loss": 0.0, "num_tokens": 1155295505.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 2035.248046875, "completions/mean_terminated_length": 415.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.49000346660614, "epoch": 0.6230795493342438, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.369886494319137e-07, "loss": 0.0, "num_tokens": 1156408400.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1259.0, "completions/mean_length": 2027.064453125, "completions/mean_terminated_length": 261.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.447984337806702, "epoch": 0.6234209627859337, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.3646960632789444e-07, "loss": 0.0, "num_tokens": 1157522721.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2034.595703125, "completions/mean_terminated_length": 1721.1905517578125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.230923414230347, "epoch": 0.6237623762376238, "frac_reward_zero_std": 0.96875, "grad_norm": 0.1828682829544293, "learning_rate": 4.3595072448178505e-07, "loss": 0.0, "num_tokens": 1158639186.0, "reward": 0.009765625, "reward_std": 0.0078125, "rewards/accuracy_reward/mean": 0.001953125, "rewards/accuracy_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0078125, "rewards/tag_count_reward/std": 0.043540701270103455, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 2031.416015625, "completions/mean_terminated_length": 632.8333740234375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 7.48283064365387, "epoch": 0.6241037896893138, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.3543200463060183e-07, "loss": 0.0, "num_tokens": 1159760855.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1127.0, "completions/mean_length": 2041.14453125, "completions/mean_terminated_length": 878.0, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "entropy": 7.446705222129822, "epoch": 0.6244452031410037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.349134475111319e-07, "loss": 0.0, "num_tokens": 1160881537.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 2028.888671875, "completions/mean_terminated_length": 650.1428833007812, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 7.38653028011322, "epoch": 0.6247866165926937, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.343950538599305e-07, "loss": 0.0, "num_tokens": 1161997064.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 2037.013671875, "completions/mean_terminated_length": 173.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.370685577392578, "epoch": 0.6251280300443838, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.338768244133212e-07, "loss": 0.0, "num_tokens": 1163123151.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1066.0, "completions/mean_length": 2027.31640625, "completions/mean_terminated_length": 535.1428833007812, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.412006378173828, "epoch": 0.6254694434960737, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.333587599073937e-07, "loss": 0.0, "num_tokens": 1164244113.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 2029.71484375, "completions/mean_terminated_length": 487.66668701171875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 7.482152462005615, "epoch": 0.6258108569477637, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.328408610780041e-07, "loss": 0.0, "num_tokens": 1165362879.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1243.0, "completions/mean_length": 2040.486328125, "completions/mean_terminated_length": 765.6666870117188, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 7.421726226806641, "epoch": 0.6261522703994538, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.323231286607727e-07, "loss": 0.0, "num_tokens": 1166485432.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1345.0, "completions/mean_length": 2019.923828125, "completions/mean_terminated_length": 251.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.414015531539917, "epoch": 0.6264936838511438, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.3180556339108385e-07, "loss": 0.0, "num_tokens": 1167597073.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 2042.115234375, "completions/mean_terminated_length": 1043.666748046875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.477341890335083, "epoch": 0.6268350973028337, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.3128816600408393e-07, "loss": 0.0, "num_tokens": 1168728284.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1084.0, "completions/mean_length": 2033.375, "completions/mean_terminated_length": 550.4000244140625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 7.453456878662109, "epoch": 0.6271765107545237, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.307709372346816e-07, "loss": 0.0, "num_tokens": 1169841980.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 2036.869140625, "completions/mean_terminated_length": 148.33334350585938, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.413187742233276, "epoch": 0.6275179242062138, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.302538778175452e-07, "loss": 0.0, "num_tokens": 1170959785.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1076.0, "completions/mean_length": 2038.462890625, "completions/mean_terminated_length": 420.3333435058594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.446491241455078, "epoch": 0.6278593376579037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.2973698848710293e-07, "loss": 0.0, "num_tokens": 1172080086.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 2041.814453125, "completions/mean_terminated_length": 464.5, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 7.433682203292847, "epoch": 0.6282007511095937, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.2922026997754156e-07, "loss": 0.0, "num_tokens": 1173194967.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2031.5703125, "completions/mean_terminated_length": 1647.4285888671875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.299561977386475, "epoch": 0.6285421645612838, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.28703723022805e-07, "loss": 0.0, "num_tokens": 1174316171.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 2034.29296875, "completions/mean_terminated_length": 293.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.445752501487732, "epoch": 0.6288835780129737, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.2818734835659355e-07, "loss": 0.0, "num_tokens": 1175434577.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1324.0, "completions/mean_length": 2030.845703125, "completions/mean_terminated_length": 291.3999938964844, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.50017249584198, "epoch": 0.6292249914646637, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.276711467123628e-07, "loss": 0.0, "num_tokens": 1176542098.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 2028.69140625, "completions/mean_terminated_length": 70.80000305175781, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.440855503082275, "epoch": 0.6295664049163537, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.271551188233224e-07, "loss": 0.0, "num_tokens": 1177656772.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 2036.3671875, "completions/mean_terminated_length": 559.0, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 7.4170156717300415, "epoch": 0.6299078183680437, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.2663926542243555e-07, "loss": 0.0, "num_tokens": 1178771328.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 2044.869140625, "completions/mean_terminated_length": 445.0, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "entropy": 7.481279611587524, "epoch": 0.6302492318197337, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.261235872424173e-07, "loss": 0.0, "num_tokens": 1179899261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 2028.33203125, "completions/mean_terminated_length": 34.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.4594186544418335, "epoch": 0.6305906452714237, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.2560808501573395e-07, "loss": 0.0, "num_tokens": 1181009655.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1187.0, "completions/mean_length": 2028.841796875, "completions/mean_terminated_length": 413.16668701171875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 7.475073575973511, "epoch": 0.6309320587231136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.250927594746018e-07, "loss": 0.0, "num_tokens": 1182121862.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 2032.87109375, "completions/mean_terminated_length": 111.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 7.386651992797852, "epoch": 0.6312734721748037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.2457761135098644e-07, "loss": 0.0, "num_tokens": 1183241796.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2030.97265625, "completions/mean_terminated_length": 1632.857177734375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.204477667808533, "epoch": 0.6316148856264937, "frac_reward_zero_std": 0.96875, "grad_norm": 0.8554598483629041, "learning_rate": 4.2406264137660106e-07, "loss": 0.0002, "num_tokens": 1184377270.0, "reward": 0.0009765625, "reward_std": 0.002668476663529873, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2025.87109375, "completions/mean_terminated_length": 1418.5555419921875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 7.313198208808899, "epoch": 0.6319562990781836, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0687441739356757, "learning_rate": 4.235478502829062e-07, "loss": 0.002, "num_tokens": 1185492500.0, "reward": 0.017578125, "reward_std": 0.011933790519833565, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.06243881583213806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.013671875, "rewards/tag_count_reward/std": 0.07856711745262146, "step": 1851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 2042.107421875, "completions/mean_terminated_length": 539.5, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 7.432624578475952, "epoch": 0.6322977125298737, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.23033238801108e-07, "loss": 0.0, "num_tokens": 1186613835.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 2033.498046875, "completions/mean_terminated_length": 191.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.3118098974227905, "epoch": 0.6326391259815637, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.2251880766215764e-07, "loss": 0.0, "num_tokens": 1187745002.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 2041.029296875, "completions/mean_terminated_length": 263.5, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 7.418975353240967, "epoch": 0.6329805394332537, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.220045575967499e-07, "loss": 0.0, "num_tokens": 1188864393.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1951.0, "completions/mean_length": 2036.681640625, "completions/mean_terminated_length": 599.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.436370372772217, "epoch": 0.6333219528849436, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.2149048933532306e-07, "loss": 0.0, "num_tokens": 1189984390.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 2029.990234375, "completions/mean_terminated_length": 203.8000030517578, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.3947731256484985, "epoch": 0.6336633663366337, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.209766036080562e-07, "loss": 0.0, "num_tokens": 1191098401.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 7.4296875, "epoch": 0.6340047797883237, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.2046290114486993e-07, "loss": 0.0, "num_tokens": 1192228001.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 1987.048828125, "completions/mean_terminated_length": 405.52630615234375, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.3623268604278564, "epoch": 0.6343461932400136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.19949382675424e-07, "loss": 0.0, "num_tokens": 1193321722.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 1858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 2030.830078125, "completions/mean_terminated_length": 582.8333740234375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 7.428724765777588, "epoch": 0.6346876066917037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.1943604892911744e-07, "loss": 0.0, "num_tokens": 1194436867.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 2040.734375, "completions/mean_terminated_length": 188.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 7.414279818534851, "epoch": 0.6350290201433937, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.1892290063508596e-07, "loss": 0.0, "num_tokens": 1195563099.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 2027.16015625, "completions/mean_terminated_length": 269.66668701171875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 7.348275184631348, "epoch": 0.6353704335950836, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.1840993852220284e-07, "loss": 0.0, "num_tokens": 1196688861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1318.0, "completions/mean_length": 2028.056640625, "completions/mean_terminated_length": 589.2857666015625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.433235049247742, "epoch": 0.6357118470467736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.178971633190762e-07, "loss": 0.0, "num_tokens": 1197802362.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 2022.791015625, "completions/mean_terminated_length": 204.1428680419922, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.399407386779785, "epoch": 0.6360532604984637, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.173845757540493e-07, "loss": 0.0, "num_tokens": 1198915023.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 2036.310546875, "completions/mean_terminated_length": 53.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 7.4426774978637695, "epoch": 0.6363946739501536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.1687217655519813e-07, "loss": 0.0, "num_tokens": 1200034926.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 1990.283203125, "completions/mean_terminated_length": 406.27777099609375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 7.397755980491638, "epoch": 0.6367360874018436, "frac_reward_zero_std": 0.96875, "grad_norm": 0.05436472930884276, "learning_rate": 4.163599664503319e-07, "loss": 0.0024, "num_tokens": 1201140239.0, "reward": 0.01513671875, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 2028.306640625, "completions/mean_terminated_length": 367.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 7.407334208488464, "epoch": 0.6370775008535337, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.158479461669905e-07, "loss": 0.0, "num_tokens": 1202253436.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 2036.8125, "completions/mean_terminated_length": 138.6666717529297, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 7.364789962768555, "epoch": 0.6374189143052236, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.1533611643244484e-07, "loss": 0.0, "num_tokens": 1203372300.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 2031.19140625, "completions/mean_terminated_length": 613.6666870117188, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 7.421345829963684, "epoch": 0.6377603277569136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.148244779736946e-07, "loss": 0.0, "num_tokens": 1204486782.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 2022.052734375, "completions/mean_terminated_length": 150.1428680419922, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.387491703033447, "epoch": 0.6381017412086036, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.143130315174683e-07, "loss": 0.0, "num_tokens": 1205600137.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.92578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1415.0, "completions/mean_length": 1951.55078125, "completions/mean_terminated_length": 748.4736938476562, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.160608172416687, "epoch": 0.6384431546602937, "frac_reward_zero_std": 0.96875, "grad_norm": 0.029663222498513938, "learning_rate": 4.138017777902214e-07, "loss": 0.0012, "num_tokens": 1206691059.0, "reward": 0.04248046875, "reward_std": 0.016129549592733383, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.10772226005792618, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.03076171875, "rewards/tag_count_reward/std": 0.11975188553333282, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 2025.689453125, "completions/mean_terminated_length": 144.1666717529297, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.406247615814209, "epoch": 0.6387845681119836, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.1329071751813606e-07, "loss": 0.0, "num_tokens": 1207809172.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 2040.447265625, "completions/mean_terminated_length": 114.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 7.390706777572632, "epoch": 0.6391259815636736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.127798514271187e-07, "loss": 0.0, "num_tokens": 1208928505.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1034.0, "completions/mean_length": 2034.6015625, "completions/mean_terminated_length": 333.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.46353542804718, "epoch": 0.6394673950153636, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.122691802428011e-07, "loss": 0.0, "num_tokens": 1210040397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1133.0, "completions/mean_length": 1991.275390625, "completions/mean_terminated_length": 595.8500366210938, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 7.270941376686096, "epoch": 0.6398088084670536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.117587046905372e-07, "loss": 0.0, "num_tokens": 1211139450.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1096.0, "completions/mean_length": 2019.30078125, "completions/mean_terminated_length": 211.25, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.339361906051636, "epoch": 0.6401502219187436, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.112484254954038e-07, "loss": 0.0, "num_tokens": 1212259348.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 2034.515625, "completions/mean_terminated_length": 322.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.372200012207031, "epoch": 0.6404916353704336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.1073834338219827e-07, "loss": 0.0, "num_tokens": 1213371548.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 2040.0390625, "completions/mean_terminated_length": 10.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.386895656585693, "epoch": 0.6408330488221236, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.1022845907543835e-07, "loss": 0.0, "num_tokens": 1214492496.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 2026.103515625, "completions/mean_terminated_length": 446.4285888671875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 7.4090259075164795, "epoch": 0.6411744622738136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.097187732993611e-07, "loss": 0.0, "num_tokens": 1215606421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 2026.6953125, "completions/mean_terminated_length": 230.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.426654100418091, "epoch": 0.6415158757255036, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.0920928677792067e-07, "loss": 0.0, "num_tokens": 1216712089.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 2029.92578125, "completions/mean_terminated_length": 891.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.366235136985779, "epoch": 0.6418572891771935, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.08700000234789e-07, "loss": 0.0, "num_tokens": 1217836835.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1412.0, "completions/mean_length": 2032.009765625, "completions/mean_terminated_length": 683.5, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 7.408693790435791, "epoch": 0.6421987026288836, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.081909143933536e-07, "loss": 0.0, "num_tokens": 1218954152.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1820.0, "completions/mean_length": 2043.12890625, "completions/mean_terminated_length": 1216.666748046875, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "entropy": 7.455351233482361, "epoch": 0.6425401160805736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.076820299767173e-07, "loss": 0.0, "num_tokens": 1220081274.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 2031.634765625, "completions/mean_terminated_length": 372.20001220703125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 7.43413519859314, "epoch": 0.6428815295322635, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.0717334770769627e-07, "loss": 0.0, "num_tokens": 1221202863.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 2035.37109375, "completions/mean_terminated_length": 431.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 7.413004636764526, "epoch": 0.6432229429839535, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.066648683088203e-07, "loss": 0.0, "num_tokens": 1222322845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 2035.705078125, "completions/mean_terminated_length": 474.25, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "entropy": 7.385245203971863, "epoch": 0.6435643564356436, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.0615659250232993e-07, "loss": 0.0, "num_tokens": 1223446838.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 2038.66796875, "completions/mean_terminated_length": 455.3333435058594, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.422540664672852, "epoch": 0.6439057698873336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.0564852101017754e-07, "loss": 0.0, "num_tokens": 1224566908.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 2030.240234375, "completions/mean_terminated_length": 229.40000915527344, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.3841962814331055, "epoch": 0.6442471833390235, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.051406545540248e-07, "loss": 0.0, "num_tokens": 1225690583.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 2034.794921875, "completions/mean_terminated_length": 357.75, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 7.381127238273621, "epoch": 0.6445885967907136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.046329938552424e-07, "loss": 0.0, "num_tokens": 1226806222.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1112.0, "completions/mean_length": 2042.99609375, "completions/mean_terminated_length": 767.0, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "entropy": 7.4253833293914795, "epoch": 0.6449300102424036, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.041255396349085e-07, "loss": 0.0, "num_tokens": 1227929948.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 2041.181640625, "completions/mean_terminated_length": 302.5, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "entropy": 7.422350645065308, "epoch": 0.6452714236940935, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.036182926138082e-07, "loss": 0.0, "num_tokens": 1229045833.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1588.0, "completions/mean_length": 2043.19921875, "completions/mean_terminated_length": 819.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 7.415916919708252, "epoch": 0.6456128371457835, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.03111253512432e-07, "loss": 0.0, "num_tokens": 1230169775.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2027.244140625, "completions/mean_terminated_length": 1564.95458984375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 7.317931652069092, "epoch": 0.6459542505974736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.0260442305097574e-07, "loss": 0.0, "num_tokens": 1231286012.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 2036.306640625, "completions/mean_terminated_length": 52.333335876464844, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 7.399356484413147, "epoch": 0.6462956640491635, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.0209780194933796e-07, "loss": 0.0, "num_tokens": 1232411849.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1736.0, "completions/mean_length": 2037.84375, "completions/mean_terminated_length": 748.0, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 7.415790557861328, "epoch": 0.6466370775008535, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.015913909271207e-07, "loss": 0.0, "num_tokens": 1233528505.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 2044.0234375, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 7.3914453983306885, "epoch": 0.6469784909525436, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.010851907036268e-07, "loss": 0.0, "num_tokens": 1234655893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.927734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1979.615234375, "completions/mean_terminated_length": 1101.7027587890625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.079140663146973, "epoch": 0.6473199044042335, "frac_reward_zero_std": 0.9375, "grad_norm": 0.045477837268425386, "learning_rate": 4.005792019978607e-07, "loss": 0.001, "num_tokens": 1235755424.0, "reward": 0.05517578125, "reward_std": 0.015746597200632095, "rewards/accuracy_reward/mean": 0.025390625, "rewards/accuracy_reward/std": 0.15746226906776428, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.02978515625, "rewards/tag_count_reward/std": 0.11690124869346619, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 2021.546875, "completions/mean_terminated_length": 355.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.40332818031311, "epoch": 0.6476613178559235, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.000734255285252e-07, "loss": 0.0, "num_tokens": 1236868280.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1735.0, "completions/mean_length": 2039.96484375, "completions/mean_terminated_length": 1019.5, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "entropy": 7.410162091255188, "epoch": 0.6480027313076135, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.995678620140227e-07, "loss": 0.0, "num_tokens": 1237987862.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1774.0, "completions/mean_length": 2040.30078125, "completions/mean_terminated_length": 1062.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 7.430483460426331, "epoch": 0.6483441447593035, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.9906251217245234e-07, "loss": 0.0, "num_tokens": 1239109200.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1732.0, "completions/mean_length": 2036.498046875, "completions/mean_terminated_length": 575.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.42235541343689, "epoch": 0.6486855582109935, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.985573767216104e-07, "loss": 0.0, "num_tokens": 1240232415.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 2028.998046875, "completions/mean_terminated_length": 102.20000457763672, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.397093176841736, "epoch": 0.6490269716626835, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.980524563789881e-07, "loss": 0.0, "num_tokens": 1241347118.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2039.1171875, "completions/mean_terminated_length": 1808.631591796875, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "entropy": 7.203136324882507, "epoch": 0.6493683851143736, "frac_reward_zero_std": 0.96875, "grad_norm": 0.05570333799267831, "learning_rate": 3.975477518617716e-07, "loss": 0.0023, "num_tokens": 1242467322.0, "reward": 0.00927734375, "reward_std": 0.003149319440126419, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00927734375, "rewards/tag_count_reward/std": 0.05451139807701111, "step": 1902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 2042.697265625, "completions/mean_terminated_length": 1143.0, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "entropy": 7.443115830421448, "epoch": 0.6497097985660635, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.9704326388683994e-07, "loss": 0.0, "num_tokens": 1243592767.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 2044.341796875, "completions/mean_terminated_length": 175.0, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 7.358668804168701, "epoch": 0.6500512120177535, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.965389931707651e-07, "loss": 0.0, "num_tokens": 1244718798.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1069.0, "completions/mean_length": 2042.603515625, "completions/mean_terminated_length": 666.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 7.407035231590271, "epoch": 0.6503926254694435, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.9603494042981e-07, "loss": 0.0, "num_tokens": 1245842979.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 2033.923828125, "completions/mean_terminated_length": 246.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.418354868888855, "epoch": 0.6507340389211335, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.955311063799287e-07, "loss": 0.0, "num_tokens": 1246962172.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 2028.775390625, "completions/mean_terminated_length": 407.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 7.41735315322876, "epoch": 0.6510754523728235, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.950274917367638e-07, "loss": 0.0, "num_tokens": 1248079209.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1143.0, "completions/mean_length": 2030.43359375, "completions/mean_terminated_length": 249.1999969482422, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.406800150871277, "epoch": 0.6514168658245135, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.9452409721564686e-07, "loss": 0.0, "num_tokens": 1249190807.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2004.53515625, "completions/mean_terminated_length": 876.7368774414062, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.275188446044922, "epoch": 0.6517582792762034, "frac_reward_zero_std": 0.96875, "grad_norm": 0.1085878941927157, "learning_rate": 3.940209235315961e-07, "loss": 0.005, "num_tokens": 1250301721.0, "reward": 0.01513671875, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 1909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 2033.341796875, "completions/mean_terminated_length": 797.1666870117188, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 7.351323962211609, "epoch": 0.6520996927278935, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.9351797139931684e-07, "loss": 0.0, "num_tokens": 1251421864.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1444.0, "completions/mean_length": 2046.8203125, "completions/mean_terminated_length": 1444.0, "completions/min_length": 1444.0, "completions/min_terminated_length": 1444.0, "entropy": 7.387140393257141, "epoch": 0.6524411061795835, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.930152415331992e-07, "loss": 0.0, "num_tokens": 1252545788.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1319.0, "completions/mean_length": 2036.41015625, "completions/mean_terminated_length": 564.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.457533359527588, "epoch": 0.6527825196312734, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.925127346473179e-07, "loss": 0.0, "num_tokens": 1253661438.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1326.0, "completions/mean_length": 2021.98046875, "completions/mean_terminated_length": 382.75, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.383142471313477, "epoch": 0.6531239330829635, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.9201045145543053e-07, "loss": 0.0, "num_tokens": 1254771668.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 2013.958984375, "completions/mean_terminated_length": 111.44444274902344, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.391179203987122, "epoch": 0.6534653465346535, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.9150839267097766e-07, "loss": 0.0, "num_tokens": 1255878351.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 2033.423828125, "completions/mean_terminated_length": 182.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.389848589897156, "epoch": 0.6538067599863434, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.9100655900708026e-07, "loss": 0.0, "num_tokens": 1256994664.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 2038.232421875, "completions/mean_terminated_length": 381.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 7.443139672279358, "epoch": 0.6541481734380334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.9050495117654e-07, "loss": 0.0, "num_tokens": 1258117103.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 2034.78125, "completions/mean_terminated_length": 356.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.427145957946777, "epoch": 0.6544895868897235, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.900035698918378e-07, "loss": 0.0, "num_tokens": 1259235375.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 2016.3125, "completions/mean_terminated_length": 245.3333282470703, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.393644094467163, "epoch": 0.6548310003414135, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.895024158651329e-07, "loss": 0.0, "num_tokens": 1260345599.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 2015.599609375, "completions/mean_terminated_length": 204.7777862548828, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.416583061218262, "epoch": 0.6551724137931034, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.8900148980826097e-07, "loss": 0.0, "num_tokens": 1261457138.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 2026.919921875, "completions/mean_terminated_length": 249.1666717529297, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.400792837142944, "epoch": 0.6555138272447935, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.8850079243273514e-07, "loss": 0.0, "num_tokens": 1262570393.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1243.0, "completions/mean_length": 2029.5078125, "completions/mean_terminated_length": 470.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.439262270927429, "epoch": 0.6558552406964835, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.880003244497427e-07, "loss": 0.0, "num_tokens": 1263686029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 2034.212890625, "completions/mean_terminated_length": 283.25, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.408461928367615, "epoch": 0.6561966541481734, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.8750008657014554e-07, "loss": 0.0, "num_tokens": 1264808314.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1084.0, "completions/mean_length": 2038.736328125, "completions/mean_terminated_length": 467.0, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 7.366876244544983, "epoch": 0.6565380675998634, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.8700007950447856e-07, "loss": 0.0, "num_tokens": 1265944003.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 2030.24609375, "completions/mean_terminated_length": 230.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.38597571849823, "epoch": 0.6568794810515535, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.865003039629491e-07, "loss": 0.0, "num_tokens": 1267054177.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 2026.763671875, "completions/mean_terminated_length": 235.83334350585938, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 7.372243642807007, "epoch": 0.6572208945032434, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.86000760655435e-07, "loss": 0.0, "num_tokens": 1268167272.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 2029.93359375, "completions/mean_terminated_length": 198.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.383281826972961, "epoch": 0.6575623079549334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.855014502914851e-07, "loss": 0.0, "num_tokens": 1269282502.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 2027.2109375, "completions/mean_terminated_length": 527.4285888671875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 7.424092769622803, "epoch": 0.6579037214066235, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.850023735803166e-07, "loss": 0.0, "num_tokens": 1270394738.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 2033.884765625, "completions/mean_terminated_length": 241.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.378316640853882, "epoch": 0.6582451348583134, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.8450353123081545e-07, "loss": 0.0, "num_tokens": 1271524951.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 2033.044921875, "completions/mean_terminated_length": 133.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.417188048362732, "epoch": 0.6585865483100034, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.8400492395153417e-07, "loss": 0.0, "num_tokens": 1272649374.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 2031.8671875, "completions/mean_terminated_length": 396.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 7.450875878334045, "epoch": 0.6589279617616934, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.8350655245069184e-07, "loss": 0.0, "num_tokens": 1273768362.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 2044.130859375, "completions/mean_terminated_length": 67.0, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 7.332497835159302, "epoch": 0.6592693752133834, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.8300841743617227e-07, "loss": 0.0, "num_tokens": 1274894445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 2040.353515625, "completions/mean_terminated_length": 90.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 7.329312086105347, "epoch": 0.6596107886650734, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.8251051961552373e-07, "loss": 0.0, "num_tokens": 1276012242.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1157.0, "completions/mean_length": 2042.60546875, "completions/mean_terminated_length": 667.0, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 7.357151508331299, "epoch": 0.6599522021167634, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.8201285969595696e-07, "loss": 0.0, "num_tokens": 1277133688.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1055.0, "completions/mean_length": 2033.775390625, "completions/mean_terminated_length": 591.4000244140625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.413524389266968, "epoch": 0.6602936155684535, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.8151543838434566e-07, "loss": 0.0, "num_tokens": 1278250085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 2041.861328125, "completions/mean_terminated_length": 1000.3333740234375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 7.412185549736023, "epoch": 0.6606350290201434, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.8101825638722395e-07, "loss": 0.0, "num_tokens": 1279372158.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1080.0, "completions/mean_length": 2037.51953125, "completions/mean_terminated_length": 706.5, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "entropy": 7.357915282249451, "epoch": 0.6609764424718334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.805213144107865e-07, "loss": 0.0, "num_tokens": 1280492312.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1299.0, "completions/mean_length": 2035.904296875, "completions/mean_terminated_length": 499.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.430601358413696, "epoch": 0.6613178559235234, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.800246131608863e-07, "loss": 0.0, "num_tokens": 1281610951.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 2036.337890625, "completions/mean_terminated_length": 57.66666793823242, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.375114560127258, "epoch": 0.6616592693752134, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.7952815334303535e-07, "loss": 0.0, "num_tokens": 1282737188.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 2032.98046875, "completions/mean_terminated_length": 125.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.408649206161499, "epoch": 0.6620006828269034, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.790319356624021e-07, "loss": 0.0, "num_tokens": 1283852074.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 2044.435546875, "completions/mean_terminated_length": 223.0, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 7.350389242172241, "epoch": 0.6623420962785934, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.7853596082381134e-07, "loss": 0.0, "num_tokens": 1284968601.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1328.0, "completions/mean_length": 2035.455078125, "completions/mean_terminated_length": 763.4000244140625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 7.412457823753357, "epoch": 0.6626835097302833, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.780402295317426e-07, "loss": 0.0, "num_tokens": 1286084146.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 2027.1953125, "completions/mean_terminated_length": 272.66668701171875, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.417264223098755, "epoch": 0.6630249231819734, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.775447424903302e-07, "loss": 0.0, "num_tokens": 1287191110.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 1996.693359375, "completions/mean_terminated_length": 853.95458984375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 7.242157697677612, "epoch": 0.6633663366336634, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.770495004033606e-07, "loss": 0.0, "num_tokens": 1288296249.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 1943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 2029.921875, "completions/mean_terminated_length": 196.8000030517578, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.440352916717529, "epoch": 0.6637077500853533, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.76554503974273e-07, "loss": 0.0, "num_tokens": 1289406001.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 2033.3515625, "completions/mean_terminated_length": 173.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.373424053192139, "epoch": 0.6640491635370434, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.7605975390615717e-07, "loss": 0.0, "num_tokens": 1290522853.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 2038.791015625, "completions/mean_terminated_length": 476.3333435058594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.400471091270447, "epoch": 0.6643905769887334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.755652509017536e-07, "loss": 0.0, "num_tokens": 1291647866.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2033.806640625, "completions/mean_terminated_length": 1684.6500244140625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 7.121656060218811, "epoch": 0.6647319904404233, "frac_reward_zero_std": 0.96875, "grad_norm": 0.03133016890896765, "learning_rate": 3.7507099566345125e-07, "loss": -0.0005, "num_tokens": 1292769287.0, "reward": 0.0087890625, "reward_std": 0.002668476663529873, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0087890625, "rewards/tag_count_reward/std": 0.051121458411216736, "step": 1947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 2035.015625, "completions/mean_terminated_length": 386.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 7.348856449127197, "epoch": 0.6650734038921133, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.745769888932876e-07, "loss": 0.0, "num_tokens": 1293888735.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 2037.146484375, "completions/mean_terminated_length": 195.6666717529297, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.413003921508789, "epoch": 0.6654148173438034, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.740832312929465e-07, "loss": 0.0, "num_tokens": 1295009914.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1236.0, "completions/mean_length": 2023.455078125, "completions/mean_terminated_length": 252.71429443359375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.432119131088257, "epoch": 0.6657562307954934, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.73589723563759e-07, "loss": 0.0, "num_tokens": 1296128339.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 1993.3046875, "completions/mean_terminated_length": 647.7999877929688, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.287482857704163, "epoch": 0.6660976442471833, "frac_reward_zero_std": 0.96875, "grad_norm": 0.050775121149165325, "learning_rate": 3.7309646640670003e-07, "loss": 0.0004, "num_tokens": 1297225199.0, "reward": 0.04296875, "reward_std": 0.010673906654119492, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.16324250400066376, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 1951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.947265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1962.15625, "completions/mean_terminated_length": 420.1481628417969, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.334058523178101, "epoch": 0.6664390576988733, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.7260346052238967e-07, "loss": 0.0, "num_tokens": 1298307727.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 2034.9296875, "completions/mean_terminated_length": 375.0, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 7.387786507606506, "epoch": 0.6667804711505634, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.721107066110901e-07, "loss": 0.0, "num_tokens": 1299427707.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 2044.0625, "completions/mean_terminated_length": 32.0, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 7.423264145851135, "epoch": 0.6671218846022533, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.716182053727067e-07, "loss": 0.0, "num_tokens": 1300560555.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1225.0, "completions/mean_length": 2036.4296875, "completions/mean_terminated_length": 567.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 7.3923187255859375, "epoch": 0.6674632980539433, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.7112595750678486e-07, "loss": 0.0, "num_tokens": 1301680935.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 2035.251953125, "completions/mean_terminated_length": 416.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.386077046394348, "epoch": 0.6678047115056334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.706339637125109e-07, "loss": 0.0, "num_tokens": 1302800312.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1126.0, "completions/mean_length": 2033.0859375, "completions/mean_terminated_length": 520.7999877929688, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 7.347700834274292, "epoch": 0.6681461249573233, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.701422246887096e-07, "loss": 0.0, "num_tokens": 1303918628.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 2040.732421875, "completions/mean_terminated_length": 1117.75, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "entropy": 7.412621021270752, "epoch": 0.6684875384090133, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.6965074113384467e-07, "loss": 0.0, "num_tokens": 1305036267.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 2028.810546875, "completions/mean_terminated_length": 83.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.484280586242676, "epoch": 0.6688289518607033, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.6915951374601584e-07, "loss": 0.0, "num_tokens": 1306147818.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 2030.03125, "completions/mean_terminated_length": 208.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.424261212348938, "epoch": 0.6691703653123933, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.686685432229604e-07, "loss": 0.0, "num_tokens": 1307275274.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 2038.494140625, "completions/mean_terminated_length": 425.66668701171875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 7.384943962097168, "epoch": 0.6695117787640833, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.681778302620494e-07, "loss": 0.0, "num_tokens": 1308398183.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1034.0, "completions/mean_length": 2039.337890625, "completions/mean_terminated_length": 569.6666870117188, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 7.425387263298035, "epoch": 0.6698531922157733, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.6768737556028904e-07, "loss": 0.0, "num_tokens": 1309516244.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 7.375, "epoch": 0.6701946056674633, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.671971798143181e-07, "loss": 0.0, "num_tokens": 1310639556.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 2023.59375, "completions/mean_terminated_length": 262.8571472167969, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.400803446769714, "epoch": 0.6705360191191533, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.6670724372040796e-07, "loss": 0.0, "num_tokens": 1311753732.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1703.0, "completions/mean_length": 2043.333984375, "completions/mean_terminated_length": 853.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.421151399612427, "epoch": 0.6708774325708433, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.6621756797446066e-07, "loss": 0.0, "num_tokens": 1312884927.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1319.0, "completions/mean_length": 2030.1875, "completions/mean_terminated_length": 528.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 7.410282254219055, "epoch": 0.6712188460225332, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.6572815327200933e-07, "loss": 0.0, "num_tokens": 1313995407.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 2018.173828125, "completions/mean_terminated_length": 139.125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.32131814956665, "epoch": 0.6715602594742233, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.652390003082151e-07, "loss": 0.0, "num_tokens": 1315107928.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 2027.630859375, "completions/mean_terminated_length": 558.1428833007812, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 7.448794484138489, "epoch": 0.6719016729259133, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.647501097778685e-07, "loss": 0.0, "num_tokens": 1316220683.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1097.0, "completions/mean_length": 2023.462890625, "completions/mean_terminated_length": 477.625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.432454586029053, "epoch": 0.6722430863776033, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.6426148237538656e-07, "loss": 0.0, "num_tokens": 1317335240.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1044.0, "completions/mean_length": 2026.74609375, "completions/mean_terminated_length": 493.4285888671875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.4020620584487915, "epoch": 0.6725844998292932, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.6377311879481296e-07, "loss": 0.0, "num_tokens": 1318458166.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 2045.357421875, "completions/mean_terminated_length": 695.0, "completions/min_length": 695.0, "completions/min_terminated_length": 695.0, "entropy": 7.411891222000122, "epoch": 0.6729259132809833, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.632850197298161e-07, "loss": 0.0, "num_tokens": 1319575181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 2029.76171875, "completions/mean_terminated_length": 180.40000915527344, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.413066744804382, "epoch": 0.6732673267326733, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.6279718587368955e-07, "loss": 0.0, "num_tokens": 1320696627.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 2042.451171875, "completions/mean_terminated_length": 627.5, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "entropy": 7.404646873474121, "epoch": 0.6736087401843632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.6230961791934934e-07, "loss": 0.0, "num_tokens": 1321821642.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 2036.3359375, "completions/mean_terminated_length": 555.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.389702320098877, "epoch": 0.6739501536360533, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.6182231655933437e-07, "loss": 0.0, "num_tokens": 1322950566.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 2029.5390625, "completions/mean_terminated_length": 157.60000610351562, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.399533748626709, "epoch": 0.6742915670877433, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.613352824858044e-07, "loss": 0.0, "num_tokens": 1324072554.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1991.75390625, "completions/mean_terminated_length": 739.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.277282118797302, "epoch": 0.6746329805394332, "frac_reward_zero_std": 0.96875, "grad_norm": 0.09041994734773448, "learning_rate": 3.6084851639054e-07, "loss": 0.0039, "num_tokens": 1325176652.0, "reward": 0.01513671875, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 2040.11328125, "completions/mean_terminated_length": 29.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 7.39578640460968, "epoch": 0.6749743939911232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.603620189649408e-07, "loss": 0.0, "num_tokens": 1326297654.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 2038.83984375, "completions/mean_terminated_length": 484.66668701171875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.436412334442139, "epoch": 0.6753158074428133, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.5987579090002496e-07, "loss": 0.0, "num_tokens": 1327416068.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 2029.89453125, "completions/mean_terminated_length": 194.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.4748430252075195, "epoch": 0.6756572208945032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.593898328864279e-07, "loss": 0.0, "num_tokens": 1328524830.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1304.0, "completions/mean_length": 2027.0234375, "completions/mean_terminated_length": 513.7142944335938, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.403495907783508, "epoch": 0.6759986343461932, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.589041456144017e-07, "loss": 0.0, "num_tokens": 1329637354.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 1995.564453125, "completions/mean_terminated_length": 705.6500244140625, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "entropy": 7.318100690841675, "epoch": 0.6763400477978833, "frac_reward_zero_std": 0.96875, "grad_norm": 0.02683657801925779, "learning_rate": 3.584187297738136e-07, "loss": -0.0013, "num_tokens": 1330749403.0, "reward": 0.017578125, "reward_std": 0.0078125, "rewards/accuracy_reward/mean": 0.001953125, "rewards/accuracy_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 1981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 2044.88671875, "completions/mean_terminated_length": 1251.0, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "entropy": 7.426356792449951, "epoch": 0.6766814612495732, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.579335860541456e-07, "loss": 0.0, "num_tokens": 1331869009.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 2037.666015625, "completions/mean_terminated_length": 284.3333435058594, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 7.379222869873047, "epoch": 0.6770228747012632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.574487151444927e-07, "loss": 0.0, "num_tokens": 1332996982.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 2040.662109375, "completions/mean_terminated_length": 169.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 7.383442997932434, "epoch": 0.6773642881529532, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.5696411773356303e-07, "loss": 0.0, "num_tokens": 1334116521.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1142.0, "completions/mean_length": 2025.10546875, "completions/mean_terminated_length": 373.4285888671875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.418179035186768, "epoch": 0.6777057016046433, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.5647979450967557e-07, "loss": 0.0, "num_tokens": 1335229231.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1998.42578125, "completions/mean_terminated_length": 712.1052856445312, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.261321067810059, "epoch": 0.6780471150563332, "frac_reward_zero_std": 0.96875, "grad_norm": 0.33549558447626526, "learning_rate": 3.559957461607608e-07, "loss": 0.0113, "num_tokens": 1336333561.0, "reward": 0.013671875, "reward_std": 0.0034938561730086803, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.013671875, "rewards/tag_count_reward/std": 0.07856711745262146, "step": 1986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1055.0, "completions/mean_length": 2020.89453125, "completions/mean_terminated_length": 313.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.3902469873428345, "epoch": 0.6783885285080232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.555119733743576e-07, "loss": 0.0, "num_tokens": 1337449539.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1797.0, "completions/mean_length": 2030.248046875, "completions/mean_terminated_length": 749.5714721679688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 7.4159910678863525, "epoch": 0.6787299419597133, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.5502847683761426e-07, "loss": 0.0, "num_tokens": 1338565762.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 2035.02734375, "completions/mean_terminated_length": 387.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.447065830230713, "epoch": 0.6790713554114032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.5454525723728644e-07, "loss": 0.0, "num_tokens": 1339685600.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1186.0, "completions/mean_length": 2042.3359375, "completions/mean_terminated_length": 598.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.332492709159851, "epoch": 0.6794127688630932, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.5406231525973653e-07, "loss": 0.0, "num_tokens": 1340814604.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 2032.384765625, "completions/mean_terminated_length": 49.25, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.372485160827637, "epoch": 0.6797541823147832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.535796515909319e-07, "loss": 0.0, "num_tokens": 1341935937.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 2024.56640625, "completions/mean_terminated_length": 334.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 7.44790256023407, "epoch": 0.6800955957664732, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.530972669164463e-07, "loss": 0.0, "num_tokens": 1343043139.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 2030.876953125, "completions/mean_terminated_length": 294.6000061035156, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.408234715461731, "epoch": 0.6804370092181632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.5261516192145523e-07, "loss": 0.0, "num_tokens": 1344162772.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 2026.89453125, "completions/mean_terminated_length": 247.0, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 7.400475740432739, "epoch": 0.6807784226698532, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.5213333729073823e-07, "loss": 0.0, "num_tokens": 1345282862.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2044.00390625, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.393604516983032, "epoch": 0.6811198361215431, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.5165179370867593e-07, "loss": 0.0, "num_tokens": 1346403552.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 2040.529296875, "completions/mean_terminated_length": 135.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.453465819358826, "epoch": 0.6814612495732332, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.511705318592504e-07, "loss": 0.0, "num_tokens": 1347529247.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1079.0, "completions/mean_length": 2032.59765625, "completions/mean_terminated_length": 470.8000183105469, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 7.410250186920166, "epoch": 0.6818026630249232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.506895524260426e-07, "loss": 0.0, "num_tokens": 1348645601.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 2041.0703125, "completions/mean_terminated_length": 274.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 7.428584814071655, "epoch": 0.6821440764766131, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.502088560922335e-07, "loss": 0.0, "num_tokens": 1349759429.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 2036.365234375, "completions/mean_terminated_length": 558.75, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 7.397495746612549, "epoch": 0.6824854899283032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4972844354060095e-07, "loss": 0.0, "num_tokens": 1350882288.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.966796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1218.0, "completions/mean_length": 2005.751953125, "completions/mean_terminated_length": 775.5882568359375, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "entropy": 7.304277062416077, "epoch": 0.6828269033799932, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.492483154535205e-07, "loss": 0.0, "num_tokens": 1351992753.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1049.0, "completions/mean_length": 2034.654296875, "completions/mean_terminated_length": 339.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 7.419494152069092, "epoch": 0.6831683168316832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4876847251296287e-07, "loss": 0.0, "num_tokens": 1353118672.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 2034.734375, "completions/mean_terminated_length": 689.6000366210938, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 7.374017953872681, "epoch": 0.6835097302833731, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4828891540049463e-07, "loss": 0.0, "num_tokens": 1354240776.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 2039.177734375, "completions/mean_terminated_length": 542.3333740234375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 7.412243604660034, "epoch": 0.6838511437350632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.478096447972756e-07, "loss": 0.0, "num_tokens": 1355364371.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 2040.837890625, "completions/mean_terminated_length": 214.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 7.432264566421509, "epoch": 0.6841925571867532, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.473306613840589e-07, "loss": 0.0, "num_tokens": 1356489248.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.955078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2013.798828125, "completions/mean_terminated_length": 1286.6522216796875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 7.207759737968445, "epoch": 0.6845339706384431, "frac_reward_zero_std": 0.96875, "grad_norm": 0.22394595509938642, "learning_rate": 3.4685196584119e-07, "loss": 0.0058, "num_tokens": 1357611609.0, "reward": 0.01123046875, "reward_std": 0.004002714995294809, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01123046875, "rewards/tag_count_reward/std": 0.06632548570632935, "step": 2005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 2032.96875, "completions/mean_terminated_length": 124.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.386974096298218, "epoch": 0.6848753840901332, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.463735588486053e-07, "loss": 0.0, "num_tokens": 1358729561.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 2041.63671875, "completions/mean_terminated_length": 419.0, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "entropy": 7.396572947502136, "epoch": 0.6852167975418232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.45895441085831e-07, "loss": 0.0, "num_tokens": 1359858271.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 2039.56640625, "completions/mean_terminated_length": 608.6666870117188, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 7.401378273963928, "epoch": 0.6855582109935131, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4541761323198295e-07, "loss": 0.0, "num_tokens": 1360978545.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.955078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1998.794921875, "completions/mean_terminated_length": 952.6522216796875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.238917231559753, "epoch": 0.6858996244452031, "frac_reward_zero_std": 0.96875, "grad_norm": 0.1769471377777299, "learning_rate": 3.449400759657653e-07, "loss": 0.0054, "num_tokens": 1362086696.0, "reward": 0.0146484375, "reward_std": 0.002668476663529873, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0146484375, "rewards/tag_count_reward/std": 0.08293935656547546, "step": 2009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1790.0, "completions/mean_length": 2043.5, "completions/mean_terminated_length": 896.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.337272763252258, "epoch": 0.6862410378968932, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4446282996546853e-07, "loss": 0.0, "num_tokens": 1363213720.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1160.0, "completions/mean_length": 2034.90625, "completions/mean_terminated_length": 707.2000122070312, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 7.413065671920776, "epoch": 0.6865824513485831, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.439858759089709e-07, "loss": 0.0, "num_tokens": 1364335864.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 2040.5625, "completions/mean_terminated_length": 778.6666870117188, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 7.430390000343323, "epoch": 0.6869238648002731, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.435092144737346e-07, "loss": 0.0, "num_tokens": 1365453912.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.966796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 2002.609375, "completions/mean_terminated_length": 680.941162109375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 7.338928461074829, "epoch": 0.6872652782519632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4303284633680716e-07, "loss": 0.0, "num_tokens": 1366564880.0, "reward": 0.046875, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1334.0, "completions/mean_length": 1989.046875, "completions/mean_terminated_length": 538.7999877929688, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.3063448667526245, "epoch": 0.6876066917036531, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.425567721748187e-07, "loss": 0.0, "num_tokens": 1367661704.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 2034.810546875, "completions/mean_terminated_length": 359.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.434463143348694, "epoch": 0.6879481051553431, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.420809926639825e-07, "loss": 0.0, "num_tokens": 1368773319.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 2041.908203125, "completions/mean_terminated_length": 488.5, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 7.364768028259277, "epoch": 0.6882895186070331, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.416055084800927e-07, "loss": 0.0, "num_tokens": 1369900504.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1517.0, "completions/mean_length": 2046.962890625, "completions/mean_terminated_length": 1517.0, "completions/min_length": 1517.0, "completions/min_terminated_length": 1517.0, "entropy": 7.435388565063477, "epoch": 0.6886309320587232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.411303202985245e-07, "loss": 0.0, "num_tokens": 1371027301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1107.0, "completions/mean_length": 2020.04296875, "completions/mean_terminated_length": 258.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.397395849227905, "epoch": 0.6889723455104131, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.406554287942324e-07, "loss": 0.0, "num_tokens": 1372135995.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 2044.056640625, "completions/mean_terminated_length": 29.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 7.43897557258606, "epoch": 0.6893137589621031, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.401808346417495e-07, "loss": 0.0, "num_tokens": 1373269272.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 2030.896484375, "completions/mean_terminated_length": 296.6000061035156, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 7.3575907945632935, "epoch": 0.6896551724137931, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3970653851518657e-07, "loss": 0.0, "num_tokens": 1374377731.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2042.037109375, "completions/mean_terminated_length": 1887.3157958984375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.325157403945923, "epoch": 0.6899965858654831, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3923254108823114e-07, "loss": 0.0, "num_tokens": 1375504806.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1182.0, "completions/mean_length": 2026.759765625, "completions/mean_terminated_length": 494.4285888671875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 7.412942171096802, "epoch": 0.6903379993171731, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.387588430341461e-07, "loss": 0.0, "num_tokens": 1376613835.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 2032.28125, "completions/mean_terminated_length": 438.3999938964844, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 7.431549549102783, "epoch": 0.6906794127688631, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.382854450257696e-07, "loss": 0.0, "num_tokens": 1377721179.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 2035.767578125, "completions/mean_terminated_length": 795.4000244140625, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "entropy": 7.381094932556152, "epoch": 0.691020826220553, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.378123477355135e-07, "loss": 0.0, "num_tokens": 1378842308.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 2035.34765625, "completions/mean_terminated_length": 428.5, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 7.441980361938477, "epoch": 0.6913622396722431, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.373395518353625e-07, "loss": 0.0, "num_tokens": 1379967094.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 2031.001953125, "completions/mean_terminated_length": 307.3999938964844, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 7.321754336357117, "epoch": 0.6917036531239331, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3686705799687285e-07, "loss": 0.0, "num_tokens": 1381082247.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 2028.490234375, "completions/mean_terminated_length": 621.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.360340356826782, "epoch": 0.692045066575623, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3639486689117234e-07, "loss": 0.0, "num_tokens": 1382192322.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 2031.197265625, "completions/mean_terminated_length": 614.1666870117188, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 7.413567066192627, "epoch": 0.6923864800273131, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3592297918895824e-07, "loss": 0.0, "num_tokens": 1383319175.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.955078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2006.71875, "completions/mean_terminated_length": 1129.04345703125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.153731346130371, "epoch": 0.6927278934790031, "frac_reward_zero_std": 0.96875, "grad_norm": 0.2200361972869508, "learning_rate": 3.354513955604971e-07, "loss": 0.0042, "num_tokens": 1384436711.0, "reward": 0.0185546875, "reward_std": 0.014511074870824814, "rewards/accuracy_reward/mean": 0.005859375, "rewards/accuracy_reward/std": 0.07639661431312561, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0126953125, "rewards/tag_count_reward/std": 0.07392385601997375, "step": 2029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 2039.041015625, "completions/mean_terminated_length": 519.0, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 7.435737133026123, "epoch": 0.693069306930693, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3498011667562365e-07, "loss": 0.0, "num_tokens": 1385555804.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1715.0, "completions/mean_length": 2041.857421875, "completions/mean_terminated_length": 999.6666870117188, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 7.398351669311523, "epoch": 0.693410720382383, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.345091432037398e-07, "loss": 0.0, "num_tokens": 1386684483.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 2033.64453125, "completions/mean_terminated_length": 578.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 7.44501805305481, "epoch": 0.6937521338340731, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.340384758138133e-07, "loss": 0.0, "num_tokens": 1387791325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 1991.01953125, "completions/mean_terminated_length": 512.5263061523438, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.324233293533325, "epoch": 0.6940935472857631, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.335681151743775e-07, "loss": 0.0, "num_tokens": 1388898215.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1344.0, "completions/mean_length": 2031.9140625, "completions/mean_terminated_length": 675.3333740234375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 7.439288139343262, "epoch": 0.694434960737453, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3309806195352976e-07, "loss": 0.0, "num_tokens": 1390010683.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 2038.34375, "completions/mean_terminated_length": 400.0, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 7.358760237693787, "epoch": 0.6947763741891431, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.32628316818931e-07, "loss": 0.0, "num_tokens": 1391132859.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 2024.7734375, "completions/mean_terminated_length": 66.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.438452839851379, "epoch": 0.6951177876408331, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3215888043780453e-07, "loss": 0.0, "num_tokens": 1392244263.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 2047.466796875, "completions/mean_terminated_length": 1775.0, "completions/min_length": 1775.0, "completions/min_terminated_length": 1775.0, "entropy": 7.388671398162842, "epoch": 0.695459201092523, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3168975347693517e-07, "loss": 0.0, "num_tokens": 1393376534.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 2029.443359375, "completions/mean_terminated_length": 147.8000030517578, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.3933327198028564, "epoch": 0.695800614544213, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3122093660266794e-07, "loss": 0.0, "num_tokens": 1394491913.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2016.349609375, "completions/mean_terminated_length": 1276.3333740234375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.200083374977112, "epoch": 0.6961420279959031, "frac_reward_zero_std": 0.96875, "grad_norm": 0.24505984082763757, "learning_rate": 3.3075243048090766e-07, "loss": 0.0023, "num_tokens": 1395605068.0, "reward": 0.0126953125, "reward_std": 0.008005429990589619, "rewards/accuracy_reward/mean": 0.002016128972172737, "rewards/accuracy_reward/std": 0.044901326298713684, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0107421875, "rewards/tag_count_reward/std": 0.06358373910188675, "step": 2039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 2033.001953125, "completions/mean_terminated_length": 128.25, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.405880928039551, "epoch": 0.696483441447593, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3028423577711755e-07, "loss": 0.0, "num_tokens": 1396723453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 2044.146484375, "completions/mean_terminated_length": 75.0, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 7.408108353614807, "epoch": 0.696824854899283, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2981635315631885e-07, "loss": 0.0, "num_tokens": 1397851544.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 2036.484375, "completions/mean_terminated_length": 82.66667175292969, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 7.361349582672119, "epoch": 0.6971662683509731, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.293487832830891e-07, "loss": 0.0, "num_tokens": 1398975488.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 2029.005859375, "completions/mean_terminated_length": 427.16668701171875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.3772571086883545, "epoch": 0.697507681802663, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.288815268215622e-07, "loss": 0.0, "num_tokens": 1400088003.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1721.0, "completions/mean_length": 2041.2734375, "completions/mean_terminated_length": 900.0, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 7.425015330314636, "epoch": 0.697849095254353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2841458443542604e-07, "loss": 0.0, "num_tokens": 1401207551.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1232.0, "completions/mean_length": 2040.646484375, "completions/mean_terminated_length": 793.0, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 7.3390398025512695, "epoch": 0.698190508706043, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.279479567879232e-07, "loss": 0.0, "num_tokens": 1402333898.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1179.0, "completions/mean_length": 2028.994140625, "completions/mean_terminated_length": 426.16668701171875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 7.395500540733337, "epoch": 0.698531922157733, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2748164454184867e-07, "loss": 0.0, "num_tokens": 1403453127.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 2044.35546875, "completions/mean_terminated_length": 182.0, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 7.444849729537964, "epoch": 0.698873335609423, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.270156483595496e-07, "loss": 0.0, "num_tokens": 1404573021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1403.0, "completions/mean_length": 2041.18359375, "completions/mean_terminated_length": 884.6666870117188, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "entropy": 7.413578033447266, "epoch": 0.699214749061113, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2654996890292453e-07, "loss": 0.0, "num_tokens": 1405692347.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 2018.998046875, "completions/mean_terminated_length": 563.1000366210938, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.392824649810791, "epoch": 0.6995561625128031, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.260846068334218e-07, "loss": 0.0, "num_tokens": 1406800186.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 2037.173828125, "completions/mean_terminated_length": 200.33334350585938, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 7.4415717124938965, "epoch": 0.699897575964493, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.256195628120387e-07, "loss": 0.0, "num_tokens": 1407919235.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 2031.921875, "completions/mean_terminated_length": 676.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.4049293994903564, "epoch": 0.700238989416183, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2515483749932136e-07, "loss": 0.0, "num_tokens": 1409033195.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2018.380859375, "completions/mean_terminated_length": 1416.125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 7.311751365661621, "epoch": 0.700580402867873, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2469043155536266e-07, "loss": 0.0, "num_tokens": 1410142894.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1738.0, "completions/mean_length": 2034.568359375, "completions/mean_terminated_length": 672.6000366210938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.377273082733154, "epoch": 0.700921816319563, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.242263456398022e-07, "loss": 0.0, "num_tokens": 1411260913.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 7.4248046875, "epoch": 0.701263229771253, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.237625804118249e-07, "loss": 0.0, "num_tokens": 1412386385.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 2030.296875, "completions/mean_terminated_length": 235.1999969482422, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.335471272468567, "epoch": 0.701604643222943, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.232991365301604e-07, "loss": 0.0, "num_tokens": 1413504601.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1104.0, "completions/mean_length": 2025.904296875, "completions/mean_terminated_length": 431.8571472167969, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.428757548332214, "epoch": 0.701946056674633, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2283601465308135e-07, "loss": 0.0, "num_tokens": 1414613624.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1201.0, "completions/mean_length": 2040.517578125, "completions/mean_terminated_length": 771.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.393736124038696, "epoch": 0.702287470126323, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2237321543840367e-07, "loss": 0.0, "num_tokens": 1415734113.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.955078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1992.056640625, "completions/mean_terminated_length": 802.6521606445312, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.250601291656494, "epoch": 0.702628883578013, "frac_reward_zero_std": 0.96875, "grad_norm": 0.06344916892141395, "learning_rate": 3.219107395434843e-07, "loss": 0.0062, "num_tokens": 1416834366.0, "reward": 0.0146484375, "reward_std": 0.002668476663529873, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0146484375, "rewards/tag_count_reward/std": 0.08293935656547546, "step": 2058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 2040.1796875, "completions/mean_terminated_length": 46.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 7.382519245147705, "epoch": 0.7029702970297029, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2144858762522156e-07, "loss": 0.0, "num_tokens": 1417952010.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 2035.392578125, "completions/mean_terminated_length": 434.25, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 7.4112865924835205, "epoch": 0.703311710481393, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2098676034005325e-07, "loss": 0.0, "num_tokens": 1419073683.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.966796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1064.0, "completions/mean_length": 2003.44921875, "completions/mean_terminated_length": 706.2352905273438, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.269687175750732, "epoch": 0.703653123933083, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.205252583439564e-07, "loss": 0.0, "num_tokens": 1420189513.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 2030.6796875, "completions/mean_terminated_length": 274.3999938964844, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 7.381300687789917, "epoch": 0.7039945373847729, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.200640822924453e-07, "loss": 0.0, "num_tokens": 1421302837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 2035.853515625, "completions/mean_terminated_length": 493.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.372917294502258, "epoch": 0.7043359508364629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1960323284057226e-07, "loss": 0.0, "num_tokens": 1422425258.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1683.0, "completions/mean_length": 2002.46484375, "completions/mean_terminated_length": 882.2999877929688, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 7.2645522356033325, "epoch": 0.704677364288153, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1914271064292476e-07, "loss": 0.0, "num_tokens": 1423535704.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 2031.1953125, "completions/mean_terminated_length": 327.20001220703125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 7.36195707321167, "epoch": 0.7050187777398429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1868251635362584e-07, "loss": 0.0, "num_tokens": 1424651388.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 2044.04296875, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 7.408947587013245, "epoch": 0.7053601911915329, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1822265062633304e-07, "loss": 0.0, "num_tokens": 1425777026.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 2044.3359375, "completions/mean_terminated_length": 172.0, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 7.365941882133484, "epoch": 0.705701604643223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1776311411423687e-07, "loss": 0.0, "num_tokens": 1426900558.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1992.640625, "completions/mean_terminated_length": 759.6364135742188, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.290338635444641, "epoch": 0.706043018094913, "frac_reward_zero_std": 0.96875, "grad_norm": 0.02345259626746541, "learning_rate": 3.173039074700602e-07, "loss": 0.0051, "num_tokens": 1428002198.0, "reward": 0.01513671875, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 2068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 2040.96484375, "completions/mean_terminated_length": 247.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 7.406112313270569, "epoch": 0.7063844315466029, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.168450313460577e-07, "loss": 0.0, "num_tokens": 1429128724.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 2040.28515625, "completions/mean_terminated_length": 73.0, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 7.382238388061523, "epoch": 0.7067258449982929, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.16386486394014e-07, "loss": 0.0, "num_tokens": 1430246662.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 2035.041015625, "completions/mean_terminated_length": 389.25, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.38689661026001, "epoch": 0.707067258449983, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1592827326524395e-07, "loss": 0.0, "num_tokens": 1431369883.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1427.0, "completions/mean_length": 2025.12890625, "completions/mean_terminated_length": 375.14288330078125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.376268982887268, "epoch": 0.7074086719016729, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.154703926105907e-07, "loss": 0.0, "num_tokens": 1432481853.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1074.0, "completions/mean_length": 2030.5859375, "completions/mean_terminated_length": 562.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 7.3840930461883545, "epoch": 0.7077500853533629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1501284508042536e-07, "loss": 0.0, "num_tokens": 1433591769.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 2036.66015625, "completions/mean_terminated_length": 112.66667175292969, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 7.3277283906936646, "epoch": 0.708091498805053, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1455563132464567e-07, "loss": 0.0, "num_tokens": 1434713307.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 2030.837890625, "completions/mean_terminated_length": 290.6000061035156, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.400264739990234, "epoch": 0.7084329122567429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1409875199267556e-07, "loss": 0.0, "num_tokens": 1435830824.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1148.0, "completions/mean_length": 2039.94140625, "completions/mean_terminated_length": 672.6666870117188, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "entropy": 7.4011300802230835, "epoch": 0.7087743257084329, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1364220773346346e-07, "loss": 0.0, "num_tokens": 1436957642.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 2035.314453125, "completions/mean_terminated_length": 424.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.374856352806091, "epoch": 0.7091157391601229, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1318599919548235e-07, "loss": 0.0, "num_tokens": 1438076731.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 2036.591796875, "completions/mean_terminated_length": 101.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.364192366600037, "epoch": 0.7094571526118129, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.127301270267282e-07, "loss": 0.0, "num_tokens": 1439190490.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 2044.556640625, "completions/mean_terminated_length": 285.0, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "entropy": 7.393409013748169, "epoch": 0.7097985660635029, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.122745918747193e-07, "loss": 0.0, "num_tokens": 1440316967.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 2034.99609375, "completions/mean_terminated_length": 383.5, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 7.396249771118164, "epoch": 0.7101399795151929, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1181939438649485e-07, "loss": 0.0, "num_tokens": 1441436133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1041.0, "completions/mean_length": 2042.685546875, "completions/mean_terminated_length": 687.5, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "entropy": 7.393303394317627, "epoch": 0.7104813929668828, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1136453520861494e-07, "loss": 0.0, "num_tokens": 1442560452.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 2037.857421875, "completions/mean_terminated_length": 317.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.434656977653503, "epoch": 0.7108228064185729, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1091001498715874e-07, "loss": 0.0, "num_tokens": 1443680443.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1361.0, "completions/mean_length": 2035.7421875, "completions/mean_terminated_length": 479.0, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 7.413076996803284, "epoch": 0.7111642198702629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.104558343677242e-07, "loss": 0.0, "num_tokens": 1444795383.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 2040.48828125, "completions/mean_terminated_length": 125.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 7.4024529457092285, "epoch": 0.7115056333219529, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.100019939954267e-07, "loss": 0.0, "num_tokens": 1445909601.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 2033.732421875, "completions/mean_terminated_length": 221.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.38470721244812, "epoch": 0.7118470467736429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0954849451489884e-07, "loss": 0.0, "num_tokens": 1447027320.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 2040.484375, "completions/mean_terminated_length": 124.0, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 7.390084981918335, "epoch": 0.7121884602253329, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.090953365702882e-07, "loss": 0.0, "num_tokens": 1448146544.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 2031.60546875, "completions/mean_terminated_length": 649.0, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 7.400025248527527, "epoch": 0.7125298736770229, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.08642520805258e-07, "loss": 0.0, "num_tokens": 1449268486.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 2034.1015625, "completions/mean_terminated_length": 269.0, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 7.381147027015686, "epoch": 0.7128712871287128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.081900478629848e-07, "loss": 0.0, "num_tokens": 1450393258.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1637.0, "completions/mean_length": 2040.158203125, "completions/mean_terminated_length": 709.6666870117188, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 7.381468057632446, "epoch": 0.7132127005804029, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.077379183861587e-07, "loss": 0.0, "num_tokens": 1451515163.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1051.0, "completions/mean_length": 2023.794921875, "completions/mean_terminated_length": 277.5714416503906, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.33277440071106, "epoch": 0.7135541140320929, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.072861330169818e-07, "loss": 0.0, "num_tokens": 1452624194.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1673.0, "completions/mean_length": 2039.693359375, "completions/mean_terminated_length": 630.3333740234375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 7.425787568092346, "epoch": 0.7138955274837828, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0683469239716753e-07, "loss": 0.0, "num_tokens": 1453743525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 2044.29296875, "completions/mean_terminated_length": 150.0, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 7.408957362174988, "epoch": 0.7142369409354729, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0638359716793926e-07, "loss": 0.0, "num_tokens": 1454870267.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 2022.51953125, "completions/mean_terminated_length": 417.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.361586689949036, "epoch": 0.7145783543871629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.059328479700303e-07, "loss": 0.0, "num_tokens": 1455978581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1722.0, "completions/mean_length": 2043.470703125, "completions/mean_terminated_length": 888.5, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 7.451975584030151, "epoch": 0.7149197678388528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.054824454436818e-07, "loss": 0.0, "num_tokens": 1457099110.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1705.0, "completions/mean_length": 2025.64453125, "completions/mean_terminated_length": 617.25, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.388835549354553, "epoch": 0.7152611812905428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0503239022864327e-07, "loss": 0.0, "num_tokens": 1458207824.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1077.0, "completions/mean_length": 2035.2890625, "completions/mean_terminated_length": 421.0, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 7.443751811981201, "epoch": 0.7156025947422329, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.045826829641701e-07, "loss": 0.0, "num_tokens": 1459325972.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1159.0, "completions/mean_length": 2029.318359375, "completions/mean_terminated_length": 453.8333435058594, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 7.425694227218628, "epoch": 0.7159440081939228, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0413332428902437e-07, "loss": 0.0, "num_tokens": 1460436023.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 2038.453125, "completions/mean_terminated_length": 418.66668701171875, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.44502592086792, "epoch": 0.7162854216456128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.036843148414722e-07, "loss": 0.0, "num_tokens": 1461563631.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1996.671875, "completions/mean_terminated_length": 588.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 7.323455095291138, "epoch": 0.7166268350973029, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.032356552592841e-07, "loss": 0.0, "num_tokens": 1462658935.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 2040.80078125, "completions/mean_terminated_length": 205.0, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 7.399184584617615, "epoch": 0.7169682485489929, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.027873461797334e-07, "loss": 0.0, "num_tokens": 1463772545.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1990.029296875, "completions/mean_terminated_length": 485.84210205078125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 7.333238124847412, "epoch": 0.7173096620006828, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0244831306486984, "learning_rate": 3.023393882395959e-07, "loss": 0.0015, "num_tokens": 1464872624.0, "reward": 0.017578125, "reward_std": 0.0078125, "rewards/accuracy_reward/mean": 0.001953125, "rewards/accuracy_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 2026.62890625, "completions/mean_terminated_length": 224.33334350585938, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 7.469751000404358, "epoch": 0.7176510754523728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.018917820751481e-07, "loss": 0.0, "num_tokens": 1465993810.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1832.0, "completions/mean_length": 2014.33984375, "completions/mean_terminated_length": 324.6000061035156, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.4054601192474365, "epoch": 0.7179924889040629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0144452832216776e-07, "loss": 0.0, "num_tokens": 1467102944.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 2045.7890625, "completions/mean_terminated_length": 1482.0, "completions/min_length": 1015.0, "completions/min_terminated_length": 1015.0, "entropy": 7.387845754623413, "epoch": 0.7183339023557528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.00997627615931e-07, "loss": 0.0, "num_tokens": 1468227220.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 2041.958984375, "completions/mean_terminated_length": 501.5, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 7.400558114051819, "epoch": 0.7186753158074428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.005510805912133e-07, "loss": 0.0, "num_tokens": 1469352911.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.978515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 2013.328125, "completions/mean_terminated_length": 434.18182373046875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.40800154209137, "epoch": 0.7190167292591328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.001048878822872e-07, "loss": 0.0, "num_tokens": 1470459399.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 2039.33203125, "completions/mean_terminated_length": 568.6666870117188, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "entropy": 7.375887513160706, "epoch": 0.7193581427108228, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.996590501229224e-07, "loss": 0.0, "num_tokens": 1471583729.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 7.46484375, "epoch": 0.7196995561625128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.99213567946384e-07, "loss": 0.0, "num_tokens": 1472707505.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 2033.759765625, "completions/mean_terminated_length": 225.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 7.46381413936615, "epoch": 0.7200409696142028, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9876844198543266e-07, "loss": 0.0, "num_tokens": 1473821590.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 2042.38671875, "completions/mean_terminated_length": 611.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 7.426731467247009, "epoch": 0.7203823830658928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.983236728723224e-07, "loss": 0.0, "num_tokens": 1474950220.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 1976.818359375, "completions/mean_terminated_length": 391.4090881347656, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 7.315951228141785, "epoch": 0.7207237965175828, "frac_reward_zero_std": 0.96875, "grad_norm": 0.017141213587250317, "learning_rate": 2.9787926123880097e-07, "loss": 0.0003, "num_tokens": 1476044031.0, "reward": 0.017578125, "reward_std": 0.0078125, "rewards/accuracy_reward/mean": 0.001953125, "rewards/accuracy_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 2044.126953125, "completions/mean_terminated_length": 65.0, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 7.428090333938599, "epoch": 0.7210652099692728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.974352077161077e-07, "loss": 0.0, "num_tokens": 1477166560.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 2035.55859375, "completions/mean_terminated_length": 774.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.382616639137268, "epoch": 0.7214066234209627, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.96991512934974e-07, "loss": 0.0, "num_tokens": 1478282990.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 2031.232421875, "completions/mean_terminated_length": 331.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.377960205078125, "epoch": 0.7217480368726528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.965481775256211e-07, "loss": 0.0, "num_tokens": 1479393381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.951171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2023.873046875, "completions/mean_terminated_length": 1553.8800048828125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 7.300370573997498, "epoch": 0.7220894503243428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9610520211776e-07, "loss": 0.0, "num_tokens": 1480509444.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1714.0, "completions/mean_length": 2034.62890625, "completions/mean_terminated_length": 678.7999877929688, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.376374006271362, "epoch": 0.7224308637760328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.956625873405905e-07, "loss": 0.0, "num_tokens": 1481623734.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1650.0, "completions/mean_length": 2039.978515625, "completions/mean_terminated_length": 679.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 7.452058792114258, "epoch": 0.7227722772277227, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.952203338228002e-07, "loss": 0.0, "num_tokens": 1482751147.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 1986.5, "completions/mean_terminated_length": 473.6000061035156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.290836334228516, "epoch": 0.7231136906794128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.947784421925631e-07, "loss": 0.0, "num_tokens": 1483847499.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 2017.84765625, "completions/mean_terminated_length": 1083.125, "completions/min_length": 628.0, "completions/min_terminated_length": 628.0, "entropy": 7.2691532373428345, "epoch": 0.7234551041311028, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.943369130775399e-07, "loss": 0.0, "num_tokens": 1484967165.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2017.48046875, "completions/mean_terminated_length": 1225.5789794921875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.256956219673157, "epoch": 0.7237965175827927, "frac_reward_zero_std": 0.96875, "grad_norm": 0.10785764355049576, "learning_rate": 2.9389574710487547e-07, "loss": 0.0065, "num_tokens": 1486075443.0, "reward": 0.01318359375, "reward_std": 0.003739949781447649, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01318359375, "rewards/tag_count_reward/std": 0.07628239691257477, "step": 2120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1996.521484375, "completions/mean_terminated_length": 792.90478515625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 7.276399612426758, "epoch": 0.7241379310344828, "frac_reward_zero_std": 0.96875, "grad_norm": 0.2919137742099395, "learning_rate": 2.934549449011997e-07, "loss": 0.0065, "num_tokens": 1487186334.0, "reward": 0.0146484375, "reward_std": 0.002668476663529873, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0146484375, "rewards/tag_count_reward/std": 0.08293935656547546, "step": 2121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 2033.845703125, "completions/mean_terminated_length": 236.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.437050223350525, "epoch": 0.7244793444861728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.930145070926254e-07, "loss": 0.0, "num_tokens": 1488305903.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2011.7109375, "completions/mean_terminated_length": 1163.2381591796875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 7.219998121261597, "epoch": 0.7248207579378627, "frac_reward_zero_std": 0.96875, "grad_norm": 0.24828229875942578, "learning_rate": 2.92574434304748e-07, "loss": 0.0067, "num_tokens": 1489414875.0, "reward": 0.01318359375, "reward_std": 0.003739949781447649, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01318359375, "rewards/tag_count_reward/std": 0.07628239691257477, "step": 2123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1083.0, "completions/mean_length": 2014.626953125, "completions/mean_terminated_length": 339.3000183105469, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.439839839935303, "epoch": 0.7251621713895527, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.921347271626442e-07, "loss": 0.0, "num_tokens": 1490522764.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1482.0, "completions/mean_length": 2032.375, "completions/mean_terminated_length": 448.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.372663140296936, "epoch": 0.7255035848412428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9169538629087153e-07, "loss": 0.0, "num_tokens": 1491641116.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 2035.025390625, "completions/mean_terminated_length": 387.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 7.344809293746948, "epoch": 0.7258449982929327, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.912564123134671e-07, "loss": 0.0, "num_tokens": 1492759705.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2027.01171875, "completions/mean_terminated_length": 1510.7000732421875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 7.1361037492752075, "epoch": 0.7261864117446227, "frac_reward_zero_std": 0.96875, "grad_norm": 2.020717646002867, "learning_rate": 2.9081780585394694e-07, "loss": 0.0093, "num_tokens": 1493885343.0, "reward": 0.01318359375, "reward_std": 0.013893296010792255, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.06243881583213806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00927734375, "rewards/tag_count_reward/std": 0.0588279627263546, "step": 2127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1184.0, "completions/mean_length": 2034.65234375, "completions/mean_terminated_length": 339.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.440819978713989, "epoch": 0.7265278251963128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9037956753530534e-07, "loss": 0.0, "num_tokens": 1495009741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 2027.732421875, "completions/mean_terminated_length": 318.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 7.384615182876587, "epoch": 0.7268692386480027, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8994169798001334e-07, "loss": 0.0, "num_tokens": 1496122708.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1749.0, "completions/mean_length": 2043.453125, "completions/mean_terminated_length": 884.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 7.417496919631958, "epoch": 0.7272106520996927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.895041978100182e-07, "loss": 0.0, "num_tokens": 1497247980.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1204.0, "completions/mean_length": 2040.52734375, "completions/mean_terminated_length": 772.6666870117188, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "entropy": 7.443210244178772, "epoch": 0.7275520655513827, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8906706764674294e-07, "loss": 0.0, "num_tokens": 1498365610.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 2044.509765625, "completions/mean_terminated_length": 261.0, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 7.418003678321838, "epoch": 0.7278934790030728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.886303081110842e-07, "loss": 0.0, "num_tokens": 1499491855.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 2021.927734375, "completions/mean_terminated_length": 141.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.400700926780701, "epoch": 0.7282348924547627, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.88193919823413e-07, "loss": 0.0, "num_tokens": 1500601658.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1994.06640625, "completions/mean_terminated_length": 1280.9444580078125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 7.114712595939636, "epoch": 0.7285763059064527, "frac_reward_zero_std": 0.96875, "grad_norm": 0.10157397097274966, "learning_rate": 2.8775790340357265e-07, "loss": 0.0012, "num_tokens": 1501702732.0, "reward": 0.02392578125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.02392578125, "rewards/tag_count_reward/std": 0.09786777198314667, "step": 2134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 2038.748046875, "completions/mean_terminated_length": 469.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 7.430038213729858, "epoch": 0.7289177193581428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.873222594708785e-07, "loss": 0.0, "num_tokens": 1502818907.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 2031.705078125, "completions/mean_terminated_length": 379.3999938964844, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 7.354498982429504, "epoch": 0.7292591328098327, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8688698864411633e-07, "loss": 0.0, "num_tokens": 1503941524.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 2037.63671875, "completions/mean_terminated_length": 279.3333435058594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.425422191619873, "epoch": 0.7296005462615227, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.864520915415426e-07, "loss": 0.0, "num_tokens": 1505069530.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1292.0, "completions/mean_length": 2032.22265625, "completions/mean_terminated_length": 701.6666870117188, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 7.4248809814453125, "epoch": 0.7299419597132127, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8601756878088236e-07, "loss": 0.0, "num_tokens": 1506188428.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1110.0, "completions/mean_length": 2017.3671875, "completions/mean_terminated_length": 479.6000061035156, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.406858205795288, "epoch": 0.7302833731649027, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.855834209793293e-07, "loss": 0.0, "num_tokens": 1507297288.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 2029.484375, "completions/mean_terminated_length": 152.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 7.45706045627594, "epoch": 0.7306247866165927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.851496487535445e-07, "loss": 0.0, "num_tokens": 1508411744.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1109.0, "completions/mean_length": 2028.21875, "completions/mean_terminated_length": 601.1428833007812, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 7.400704979896545, "epoch": 0.7309662000682827, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8471625271965537e-07, "loss": 0.0, "num_tokens": 1509529008.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 2023.078125, "completions/mean_terminated_length": 453.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.41650390625, "epoch": 0.7313076135199726, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.842832334932554e-07, "loss": 0.0, "num_tokens": 1510635992.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 2021.0, "completions/mean_terminated_length": 320.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.384036898612976, "epoch": 0.7316490269716627, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.838505916894023e-07, "loss": 0.0, "num_tokens": 1511746264.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 2037.6953125, "completions/mean_terminated_length": 289.3333435058594, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 7.414398670196533, "epoch": 0.7319904404233527, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.834183279226181e-07, "loss": 0.0, "num_tokens": 1512866460.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 2047.544921875, "completions/mean_terminated_length": 1815.0, "completions/min_length": 1815.0, "completions/min_terminated_length": 1815.0, "entropy": 7.400102019309998, "epoch": 0.7323318538750426, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.829864428068875e-07, "loss": 0.0, "num_tokens": 1513991203.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1919.0, "completions/mean_length": 2034.1875, "completions/mean_terminated_length": 633.6000366210938, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.378037810325623, "epoch": 0.7326732673267327, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.825549369556578e-07, "loss": 0.0, "num_tokens": 1515113731.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.966796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 2005.107421875, "completions/mean_terminated_length": 756.1764526367188, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "entropy": 7.350157380104065, "epoch": 0.7330146807784227, "frac_reward_zero_std": 0.96875, "grad_norm": 0.017832460484145156, "learning_rate": 2.821238109818374e-07, "loss": 0.0013, "num_tokens": 1516222426.0, "reward": 0.01416015625, "reward_std": 0.003149319440126419, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01416015625, "rewards/tag_count_reward/std": 0.08078429847955704, "step": 2147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 2015.4609375, "completions/mean_terminated_length": 196.88888549804688, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.4470027685165405, "epoch": 0.7333560942301127, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8169306549779526e-07, "loss": 0.0, "num_tokens": 1517342470.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1123.0, "completions/mean_length": 1975.32421875, "completions/mean_terminated_length": 356.6363830566406, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.389275550842285, "epoch": 0.7336975076818026, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8126270111535945e-07, "loss": 0.0, "num_tokens": 1518433836.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 2026.330078125, "completions/mean_terminated_length": 463.0000305175781, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 7.378563046455383, "epoch": 0.7340389211334927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8083271844581723e-07, "loss": 0.0, "num_tokens": 1519544165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 2031.8671875, "completions/mean_terminated_length": 396.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 7.416071057319641, "epoch": 0.7343803345851827, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8040311809991334e-07, "loss": 0.0, "num_tokens": 1520656817.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 2034.96875, "completions/mean_terminated_length": 380.0, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 7.416515469551086, "epoch": 0.7347217480368726, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7997390068784967e-07, "loss": 0.0, "num_tokens": 1521773393.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 2028.845703125, "completions/mean_terminated_length": 647.0, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 7.441860675811768, "epoch": 0.7350631614885627, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.795450668192842e-07, "loss": 0.0, "num_tokens": 1522883938.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1217.0, "completions/mean_length": 2034.18359375, "completions/mean_terminated_length": 869.0, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "entropy": 7.392654538154602, "epoch": 0.7354045749402527, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.791166171033301e-07, "loss": 0.0, "num_tokens": 1523998960.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1382.0, "completions/mean_length": 1981.806640625, "completions/mean_terminated_length": 434.1428527832031, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 7.3314666748046875, "epoch": 0.7357459883919426, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7868855214855465e-07, "loss": 0.0, "num_tokens": 1525098573.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1234.0, "completions/mean_length": 2030.46875, "completions/mean_terminated_length": 552.0, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 7.41617476940155, "epoch": 0.7360874018436326, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.78260872562979e-07, "loss": 0.0, "num_tokens": 1526214365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 2043.998046875, "completions/mean_terminated_length": 1023.5, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "entropy": 7.478433609008789, "epoch": 0.7364288152953227, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.778335789540767e-07, "loss": 0.0, "num_tokens": 1527330620.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1125.0, "completions/mean_length": 2021.15625, "completions/mean_terminated_length": 330.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.398069143295288, "epoch": 0.7367702287470126, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.774066719287729e-07, "loss": 0.0, "num_tokens": 1528452572.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1069.0, "completions/mean_length": 2039.646484375, "completions/mean_terminated_length": 622.3333740234375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 7.444293975830078, "epoch": 0.7371116421987026, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7698015209344404e-07, "loss": 0.0, "num_tokens": 1529572023.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1115.0, "completions/mean_length": 2022.900390625, "completions/mean_terminated_length": 441.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.417786359786987, "epoch": 0.7374530556503927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.765540200539166e-07, "loss": 0.0, "num_tokens": 1530685828.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 2026.330078125, "completions/mean_terminated_length": 661.125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 7.436994314193726, "epoch": 0.7377944691020826, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7612827641546566e-07, "loss": 0.0, "num_tokens": 1531803821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1049.0, "completions/mean_length": 2029.15625, "completions/mean_terminated_length": 440.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.416407108306885, "epoch": 0.7381358825537726, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.757029217828153e-07, "loss": 0.0, "num_tokens": 1532914733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1033.0, "completions/mean_length": 2027.544921875, "completions/mean_terminated_length": 302.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.444401741027832, "epoch": 0.7384772960054626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7527795676013654e-07, "loss": 0.0, "num_tokens": 1534035892.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1876.0, "completions/mean_length": 2033.423828125, "completions/mean_terminated_length": 555.4000244140625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.413720726966858, "epoch": 0.7388187094571527, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7485338195104736e-07, "loss": 0.0, "num_tokens": 1535153181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 2032.947265625, "completions/mean_terminated_length": 506.6000061035156, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.453753471374512, "epoch": 0.7391601229088426, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.744291979586112e-07, "loss": 0.0, "num_tokens": 1536262210.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1070.0, "completions/mean_length": 2032.591796875, "completions/mean_terminated_length": 470.20001220703125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 7.409167170524597, "epoch": 0.7395015363605326, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.740054053853369e-07, "loss": 0.0, "num_tokens": 1537384177.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1186.0, "completions/mean_length": 2033.0, "completions/mean_terminated_length": 512.0, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 7.440397500991821, "epoch": 0.7398429498122226, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.735820048331765e-07, "loss": 0.0, "num_tokens": 1538501169.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2036.64453125, "completions/mean_terminated_length": 1742.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.2592562437057495, "epoch": 0.7401843632639126, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.731589969035261e-07, "loss": 0.0, "num_tokens": 1539630587.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 2020.529296875, "completions/mean_terminated_length": 485.22222900390625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.442092061042786, "epoch": 0.7405257767156026, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7273638219722315e-07, "loss": 0.0, "num_tokens": 1540739674.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 2034.44140625, "completions/mean_terminated_length": 312.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.443757176399231, "epoch": 0.7408671901672926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.723141613145476e-07, "loss": 0.0, "num_tokens": 1541861660.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1192.0, "completions/mean_length": 2028.361328125, "completions/mean_terminated_length": 611.5714721679688, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 7.359526515007019, "epoch": 0.7412086036189826, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7189233485521934e-07, "loss": 0.0, "num_tokens": 1542975205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 2025.185546875, "completions/mean_terminated_length": 101.16667175292969, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.424191951751709, "epoch": 0.7415500170706726, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.714709034183984e-07, "loss": 0.0, "num_tokens": 1544082580.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 2029.1875, "completions/mean_terminated_length": 442.66668701171875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.435539960861206, "epoch": 0.7418914305223626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7104986760268324e-07, "loss": 0.0, "num_tokens": 1545199300.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 2034.43359375, "completions/mean_terminated_length": 311.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 7.402535080909729, "epoch": 0.7422328439740525, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.70629228006111e-07, "loss": 0.0, "num_tokens": 1546315778.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1062.0, "completions/mean_length": 2034.677734375, "completions/mean_terminated_length": 342.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.428807973861694, "epoch": 0.7425742574257426, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.702089852261553e-07, "loss": 0.0, "num_tokens": 1547441149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2018.939453125, "completions/mean_terminated_length": 1264.894775390625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.271121501922607, "epoch": 0.7429156708774326, "frac_reward_zero_std": 0.96875, "grad_norm": 0.6046849398981464, "learning_rate": 2.6978913985972683e-07, "loss": -0.0033, "num_tokens": 1548554270.0, "reward": 0.01171875, "reward_std": 0.014823175966739655, "rewards/accuracy_reward/mean": 0.008064515888690948, "rewards/accuracy_reward/std": 0.0895301103591919, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00390625, "rewards/tag_count_reward/std": 0.0347534641623497, "step": 2176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1530.0, "completions/mean_length": 2034.416015625, "completions/mean_terminated_length": 657.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.384331226348877, "epoch": 0.7432570843291225, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6936969250317154e-07, "loss": 0.0, "num_tokens": 1549684787.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 2018.736328125, "completions/mean_terminated_length": 383.22222900390625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.415239334106445, "epoch": 0.7435984977808126, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6895064375227e-07, "loss": 0.0, "num_tokens": 1550792508.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 1992.94921875, "completions/mean_terminated_length": 766.8181762695312, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 7.312429189682007, "epoch": 0.7439399112325026, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.685319942022364e-07, "loss": 0.0, "num_tokens": 1551898754.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 2033.861328125, "completions/mean_terminated_length": 238.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.409624338150024, "epoch": 0.7442813246841925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6811374444771833e-07, "loss": 0.0, "num_tokens": 1553011259.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1091.0, "completions/mean_length": 1974.833984375, "completions/mean_terminated_length": 487.125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.377131819725037, "epoch": 0.7446227381358825, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.676958950827952e-07, "loss": 0.0, "num_tokens": 1554112966.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1090.0, "completions/mean_length": 2035.66015625, "completions/mean_terminated_length": 784.4000244140625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 7.387057900428772, "epoch": 0.7449641515875726, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6727844670097776e-07, "loss": 0.0, "num_tokens": 1555232984.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 2022.333984375, "completions/mean_terminated_length": 170.71429443359375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.415935397148132, "epoch": 0.7453055650392626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.668613998952074e-07, "loss": 0.0, "num_tokens": 1556341283.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1080.0, "completions/mean_length": 2029.193359375, "completions/mean_terminated_length": 443.16668701171875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 7.3757665157318115, "epoch": 0.7456469784909525, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6644475525785497e-07, "loss": 0.0, "num_tokens": 1557465062.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 2026.5, "completions/mean_terminated_length": 475.4285888671875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.4129520654678345, "epoch": 0.7459883919426425, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6602851338072e-07, "loss": 0.0, "num_tokens": 1558575574.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 2030.30078125, "completions/mean_terminated_length": 235.60000610351562, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.406508564949036, "epoch": 0.7463298053943326, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.656126748550301e-07, "loss": 0.0, "num_tokens": 1559696144.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 2040.095703125, "completions/mean_terminated_length": 24.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 7.482832193374634, "epoch": 0.7466712188460225, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6519724027143977e-07, "loss": 0.0, "num_tokens": 1560817665.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 2038.876953125, "completions/mean_terminated_length": 491.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 7.409849405288696, "epoch": 0.7470126322977125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6478221022002987e-07, "loss": 0.0, "num_tokens": 1561937938.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1102.0, "completions/mean_length": 2038.296875, "completions/mean_terminated_length": 392.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 7.416775941848755, "epoch": 0.7473540457494026, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.643675852903069e-07, "loss": 0.0, "num_tokens": 1563056410.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1099.0, "completions/mean_length": 2032.47265625, "completions/mean_terminated_length": 458.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 7.351398825645447, "epoch": 0.7476954592010925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6395336607120155e-07, "loss": 0.0, "num_tokens": 1564171100.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1909.0, "completions/mean_length": 2028.8828125, "completions/mean_terminated_length": 416.66668701171875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 7.425043106079102, "epoch": 0.7480368726527825, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.635395531510683e-07, "loss": 0.0, "num_tokens": 1565290112.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1205.0, "completions/mean_length": 2042.361328125, "completions/mean_terminated_length": 604.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.395302414894104, "epoch": 0.7483782861044725, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6312614711768475e-07, "loss": 0.0, "num_tokens": 1566415657.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 2037.66015625, "completions/mean_terminated_length": 283.3333435058594, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.48689079284668, "epoch": 0.7487196995561625, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6271314855825034e-07, "loss": 0.0, "num_tokens": 1567526219.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1178.0, "completions/mean_length": 2035.291015625, "completions/mean_terminated_length": 421.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 7.336536884307861, "epoch": 0.7490611130078525, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6230055805938577e-07, "loss": 0.0, "num_tokens": 1568645728.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2040.919921875, "completions/mean_terminated_length": 1846.611083984375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 7.215132713317871, "epoch": 0.7494025264595425, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6188837620713223e-07, "loss": 0.0, "num_tokens": 1569773095.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1293.0, "completions/mean_length": 2014.05859375, "completions/mean_terminated_length": 310.20001220703125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 7.444048881530762, "epoch": 0.7497439399112324, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6147660358695063e-07, "loss": 0.0, "num_tokens": 1570885461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1087.0, "completions/mean_length": 2036.830078125, "completions/mean_terminated_length": 618.25, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 7.371320128440857, "epoch": 0.7500853533629225, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.610652407837201e-07, "loss": 0.0, "num_tokens": 1572009886.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 2037.767578125, "completions/mean_terminated_length": 301.66668701171875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 7.41507625579834, "epoch": 0.7504267668146125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.606542883817381e-07, "loss": 0.0, "num_tokens": 1573127735.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 2036.962890625, "completions/mean_terminated_length": 164.33334350585938, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.404842376708984, "epoch": 0.7507681802663025, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.602437469647189e-07, "loss": 0.0, "num_tokens": 1574242804.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1342.0, "completions/mean_length": 2039.05078125, "completions/mean_terminated_length": 520.6666870117188, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 7.4215134382247925, "epoch": 0.7511095937179925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.598336171157932e-07, "loss": 0.0, "num_tokens": 1575358526.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 2040.07421875, "completions/mean_terminated_length": 19.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.4348061084747314, "epoch": 0.7514510071696825, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.594238994175072e-07, "loss": 0.0, "num_tokens": 1576479812.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1118.0, "completions/mean_length": 2035.779296875, "completions/mean_terminated_length": 483.75, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 7.45954966545105, "epoch": 0.7517924206213725, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.590145944518215e-07, "loss": 0.0, "num_tokens": 1577591059.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 2032.716796875, "completions/mean_terminated_length": 91.75, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.444973349571228, "epoch": 0.7521338340730624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5860570280011027e-07, "loss": 0.0, "num_tokens": 1578703650.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 2041.11328125, "completions/mean_terminated_length": 285.0, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "entropy": 7.376545310020447, "epoch": 0.7524752475247525, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.581972250431611e-07, "loss": 0.0, "num_tokens": 1579830284.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 2034.54296875, "completions/mean_terminated_length": 325.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 7.418949842453003, "epoch": 0.7528166609764425, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5778916176117314e-07, "loss": 0.0, "num_tokens": 1580945954.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 2033.044921875, "completions/mean_terminated_length": 516.6000366210938, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 7.434063673019409, "epoch": 0.7531580744281324, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5738151353375736e-07, "loss": 0.0, "num_tokens": 1582065289.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 2028.001953125, "completions/mean_terminated_length": 341.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.394159913063049, "epoch": 0.7534994878798225, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.569742809399347e-07, "loss": 0.0, "num_tokens": 1583181530.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 2033.5859375, "completions/mean_terminated_length": 203.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.365495324134827, "epoch": 0.7538409013315125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5656746455813615e-07, "loss": 0.0, "num_tokens": 1584306038.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1060.0, "completions/mean_length": 2029.10546875, "completions/mean_terminated_length": 435.66668701171875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 7.36223030090332, "epoch": 0.7541823147832024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5616106496620125e-07, "loss": 0.0, "num_tokens": 1585426940.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1085.0, "completions/mean_length": 1993.763671875, "completions/mean_terminated_length": 659.5499877929688, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 7.246438980102539, "epoch": 0.7545237282348924, "frac_reward_zero_std": 0.96875, "grad_norm": 0.07204624789307261, "learning_rate": 2.557550827413776e-07, "loss": 0.0003, "num_tokens": 1586528883.0, "reward": 0.044921875, "reward_std": 0.0078125, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 2036.861328125, "completions/mean_terminated_length": 147.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 7.415588974952698, "epoch": 0.7548651416865825, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5534951846032e-07, "loss": 0.0, "num_tokens": 1587643356.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 2029.0078125, "completions/mean_terminated_length": 103.20000457763672, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.444991230964661, "epoch": 0.7552065551382724, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5494437269908976e-07, "loss": 0.0, "num_tokens": 1588761920.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1310.0, "completions/mean_length": 2037.919921875, "completions/mean_terminated_length": 757.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.449827790260315, "epoch": 0.7555479685899624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.545396460331529e-07, "loss": 0.0, "num_tokens": 1589878919.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 2040.904296875, "completions/mean_terminated_length": 231.5, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 7.4264150857925415, "epoch": 0.7558893820416525, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5413533903738184e-07, "loss": 0.0, "num_tokens": 1591002374.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 2041.8984375, "completions/mean_terminated_length": 486.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 7.473604679107666, "epoch": 0.7562307954933425, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5373145228605103e-07, "loss": 0.0, "num_tokens": 1592117682.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1160.0, "completions/mean_length": 2028.369140625, "completions/mean_terminated_length": 372.8333435058594, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 7.448229193687439, "epoch": 0.7565722089450324, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5332798635283947e-07, "loss": 0.0, "num_tokens": 1593228815.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 2033.6640625, "completions/mean_terminated_length": 213.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 7.44201934337616, "epoch": 0.7569136223967224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5292494181082726e-07, "loss": 0.0, "num_tokens": 1594342275.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1090.0, "completions/mean_length": 2018.244140625, "completions/mean_terminated_length": 355.22222900390625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.444450736045837, "epoch": 0.7572550358484125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5252231923249703e-07, "loss": 0.0, "num_tokens": 1595447328.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 2000.404296875, "completions/mean_terminated_length": 765.4210815429688, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.2699785232543945, "epoch": 0.7575964493001024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5212011918973085e-07, "loss": 0.0, "num_tokens": 1596545135.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 2032.9609375, "completions/mean_terminated_length": 123.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.412005066871643, "epoch": 0.7579378627517924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.517183422538122e-07, "loss": 0.0, "num_tokens": 1597667163.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1319.0, "completions/mean_length": 2030.404296875, "completions/mean_terminated_length": 761.0000610351562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 7.447129249572754, "epoch": 0.7582792762034825, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.513169889954221e-07, "loss": 0.0, "num_tokens": 1598783194.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 2033.67578125, "completions/mean_terminated_length": 581.2000122070312, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.420493483543396, "epoch": 0.7586206896551724, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.509160599846407e-07, "loss": 0.0, "num_tokens": 1599901748.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1771.0, "completions/mean_length": 2047.458984375, "completions/mean_terminated_length": 1771.0, "completions/min_length": 1771.0, "completions/min_terminated_length": 1771.0, "entropy": 7.411186218261719, "epoch": 0.7589621031068624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5051555579094493e-07, "loss": 0.0, "num_tokens": 1601036063.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1074.0, "completions/mean_length": 2034.654296875, "completions/mean_terminated_length": 339.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 7.418184161186218, "epoch": 0.7593035165585524, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.501154769832089e-07, "loss": 0.0, "num_tokens": 1602148910.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 2035.697265625, "completions/mean_terminated_length": 473.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 7.403064727783203, "epoch": 0.7596449300102424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4971582412970195e-07, "loss": 0.0, "num_tokens": 1603264803.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 2044.029296875, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 7.391099572181702, "epoch": 0.7599863434619324, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4931659779808874e-07, "loss": 0.0, "num_tokens": 1604385090.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2044.00390625, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.395167350769043, "epoch": 0.7603277569136224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.489177985554282e-07, "loss": 0.0, "num_tokens": 1605515652.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1205.0, "completions/mean_length": 2036.2421875, "completions/mean_terminated_length": 543.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 7.400001525878906, "epoch": 0.7606691703653123, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.485194269681723e-07, "loss": 0.0, "num_tokens": 1606630976.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 2029.330078125, "completions/mean_terminated_length": 136.1999969482422, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.413339734077454, "epoch": 0.7610105838170024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.481214836021657e-07, "loss": 0.0, "num_tokens": 1607746169.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1120.0, "completions/mean_length": 2028.4140625, "completions/mean_terminated_length": 376.66668701171875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.351840853691101, "epoch": 0.7613519972686924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4772396902264505e-07, "loss": 0.0, "num_tokens": 1608860541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 2033.716796875, "completions/mean_terminated_length": 219.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 7.40331768989563, "epoch": 0.7616934107203824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4732688379423744e-07, "loss": 0.0, "num_tokens": 1609974444.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1786.0, "completions/mean_length": 2019.09765625, "completions/mean_terminated_length": 1225.888916015625, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.320581555366516, "epoch": 0.7620348241720724, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0663004812689912, "learning_rate": 2.4693022848096054e-07, "loss": -0.0011, "num_tokens": 1611080686.0, "reward": 0.01513671875, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 2232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1094.0, "completions/mean_length": 2031.076171875, "completions/mean_terminated_length": 603.8333740234375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.381928205490112, "epoch": 0.7623762376237624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.465340036462213e-07, "loss": 0.0, "num_tokens": 1612210949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 2040.650390625, "completions/mean_terminated_length": 1107.25, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 7.3730621337890625, "epoch": 0.7627176510754524, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4613820985281524e-07, "loss": 0.0, "num_tokens": 1613337618.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 2035.896484375, "completions/mean_terminated_length": 498.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.426261901855469, "epoch": 0.7630590645271423, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.457428476629253e-07, "loss": 0.0, "num_tokens": 1614458349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.951171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2018.392578125, "completions/mean_terminated_length": 1441.6400146484375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 7.357899308204651, "epoch": 0.7634004779788324, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.45347917638122e-07, "loss": 0.0, "num_tokens": 1615577254.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1090.0, "completions/mean_length": 2021.822265625, "completions/mean_terminated_length": 372.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.4341961145401, "epoch": 0.7637418914305224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4495342033936115e-07, "loss": 0.0, "num_tokens": 1616693003.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 2024.931640625, "completions/mean_terminated_length": 360.71429443359375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.422170996665955, "epoch": 0.7640833048822123, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4455935632698474e-07, "loss": 0.0, "num_tokens": 1617812824.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 2030.34375, "completions/mean_terminated_length": 240.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.400160312652588, "epoch": 0.7644247183339024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4416572616071895e-07, "loss": 0.0, "num_tokens": 1618926328.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 2034.30078125, "completions/mean_terminated_length": 645.2000122070312, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 7.388180255889893, "epoch": 0.7647661317855924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4377253039967396e-07, "loss": 0.0, "num_tokens": 1620047634.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 2019.607421875, "completions/mean_terminated_length": 230.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.3537222146987915, "epoch": 0.7651075452372823, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.433797696023424e-07, "loss": 0.0, "num_tokens": 1621154729.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 2045.236328125, "completions/mean_terminated_length": 633.0, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "entropy": 7.405512571334839, "epoch": 0.7654489586889723, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4298744432659973e-07, "loss": 0.0, "num_tokens": 1622284738.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 2034.24609375, "completions/mean_terminated_length": 287.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.395649194717407, "epoch": 0.7657903721406624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4259555512970206e-07, "loss": 0.0, "num_tokens": 1623404832.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2028.216796875, "completions/mean_terminated_length": 1565.666748046875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.1907570362091064, "epoch": 0.7661317855923523, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0201937019118897, "learning_rate": 2.422041025682869e-07, "loss": 0.0039, "num_tokens": 1624528527.0, "reward": 0.00927734375, "reward_std": 0.003149319440126419, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00927734375, "rewards/tag_count_reward/std": 0.05451139807701111, "step": 2244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 2019.734375, "completions/mean_terminated_length": 440.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.40966534614563, "epoch": 0.7664731990440423, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4181308719837103e-07, "loss": 0.0, "num_tokens": 1625643383.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1420.0, "completions/mean_length": 1942.2578125, "completions/mean_terminated_length": 544.1111450195312, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.263198137283325, "epoch": 0.7668146124957324, "frac_reward_zero_std": 0.9375, "grad_norm": 0.07732521251792385, "learning_rate": 2.414225095753506e-07, "loss": 0.0008, "num_tokens": 1626726155.0, "reward": 0.0576171875, "reward_std": 0.021095803007483482, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.16324250400066376, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0302734375, "rewards/tag_count_reward/std": 0.11833616346120834, "step": 2246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 2027.029296875, "completions/mean_terminated_length": 258.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 7.389182806015015, "epoch": 0.7671560259474224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4103237025399946e-07, "loss": 0.0, "num_tokens": 1627841594.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1995.96484375, "completions/mean_terminated_length": 715.9000244140625, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.368919730186462, "epoch": 0.7674974393991123, "frac_reward_zero_std": 0.96875, "grad_norm": 0.06912903915962802, "learning_rate": 2.406426697884696e-07, "loss": -0.0, "num_tokens": 1628945176.0, "reward": 0.01513671875, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 2248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1213.0, "completions/mean_length": 2041.130859375, "completions/mean_terminated_length": 875.6666870117188, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "entropy": 7.40398907661438, "epoch": 0.7678388528508023, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4025340873228897e-07, "loss": 0.0, "num_tokens": 1630067675.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.955078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1200.0, "completions/mean_length": 1981.279296875, "completions/mean_terminated_length": 562.7391357421875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 7.376506567001343, "epoch": 0.7681802663024924, "frac_reward_zero_std": 0.96875, "grad_norm": 0.03495459517748063, "learning_rate": 2.3986458763836177e-07, "loss": 0.0009, "num_tokens": 1631157626.0, "reward": 0.029296875, "reward_std": 0.016010859981179237, "rewards/accuracy_reward/mean": 0.013671875, "rewards/accuracy_reward/std": 0.1162383034825325, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1330.0, "completions/mean_length": 2024.560546875, "completions/mean_terminated_length": 547.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.424293279647827, "epoch": 0.7685216797541823, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3947620705896734e-07, "loss": 0.0, "num_tokens": 1632271353.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 2045.791015625, "completions/mean_terminated_length": 917.0, "completions/min_length": 917.0, "completions/min_terminated_length": 917.0, "entropy": 7.44263482093811, "epoch": 0.7688630932058723, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3908826754575923e-07, "loss": 0.0, "num_tokens": 1633399246.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1259.0, "completions/mean_length": 2037.76953125, "completions/mean_terminated_length": 738.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 7.328388333320618, "epoch": 0.7692045066575623, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3870076964976424e-07, "loss": 0.0, "num_tokens": 1634531224.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 2031.0625, "completions/mean_terminated_length": 602.6666870117188, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 7.4158676862716675, "epoch": 0.7695459201092523, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3831371392138237e-07, "loss": 0.0, "num_tokens": 1635648920.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1061.0, "completions/mean_length": 2032.9765625, "completions/mean_terminated_length": 509.6000061035156, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 7.414567589759827, "epoch": 0.7698873335609423, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3792710091038506e-07, "loss": 0.0, "num_tokens": 1636758044.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1098.0, "completions/mean_length": 2040.15234375, "completions/mean_terminated_length": 708.6666870117188, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "entropy": 7.425726056098938, "epoch": 0.7702287470126323, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3754093116591534e-07, "loss": 0.0, "num_tokens": 1637883482.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 2029.666015625, "completions/mean_terminated_length": 170.60000610351562, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 7.429205536842346, "epoch": 0.7705701604643223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3715520523648647e-07, "loss": 0.0, "num_tokens": 1638998191.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1143.0, "completions/mean_length": 2029.4375, "completions/mean_terminated_length": 464.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.40562891960144, "epoch": 0.7709115739160123, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3676992366998136e-07, "loss": 0.0, "num_tokens": 1640111391.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1220.0, "completions/mean_length": 2029.2109375, "completions/mean_terminated_length": 444.66668701171875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.441134572029114, "epoch": 0.7712529873677023, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3638508701365153e-07, "loss": 0.0, "num_tokens": 1641218699.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1918.0, "completions/mean_length": 2041.130859375, "completions/mean_terminated_length": 875.6666870117188, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.442109107971191, "epoch": 0.7715944008193922, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3600069581411693e-07, "loss": 0.0, "num_tokens": 1642338590.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1213.0, "completions/mean_length": 2032.8984375, "completions/mean_terminated_length": 501.6000061035156, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 7.441718101501465, "epoch": 0.7719358142710823, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.356167506173644e-07, "loss": 0.0, "num_tokens": 1643447626.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 2033.55859375, "completions/mean_terminated_length": 569.2000122070312, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.410788059234619, "epoch": 0.7722772277227723, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3523325196874746e-07, "loss": 0.0, "num_tokens": 1644560376.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1131.0, "completions/mean_length": 1987.916015625, "completions/mean_terminated_length": 583.0952758789062, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.354452967643738, "epoch": 0.7726186411744623, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3485020041298544e-07, "loss": 0.0, "num_tokens": 1645649837.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1748.0, "completions/mean_length": 2024.259765625, "completions/mean_terminated_length": 528.625, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.426469683647156, "epoch": 0.7729600546261522, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.344675964941627e-07, "loss": 0.0, "num_tokens": 1646760322.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 2043.333984375, "completions/mean_terminated_length": 853.5, "completions/min_length": 757.0, "completions/min_terminated_length": 757.0, "entropy": 7.432860970497131, "epoch": 0.7733014680778423, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3408544075572727e-07, "loss": 0.0, "num_tokens": 1647876813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1634.0, "completions/mean_length": 2035.69140625, "completions/mean_terminated_length": 472.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 7.396092057228088, "epoch": 0.7736428815295323, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.337037337404913e-07, "loss": 0.0, "num_tokens": 1648991807.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1165.0, "completions/mean_length": 2026.626953125, "completions/mean_terminated_length": 484.71429443359375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 7.390552282333374, "epoch": 0.7739842949812222, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.333224759906288e-07, "loss": 0.0, "num_tokens": 1650106928.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 2035.58203125, "completions/mean_terminated_length": 776.4000244140625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 7.403513193130493, "epoch": 0.7743257084329123, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3294166804767634e-07, "loss": 0.0, "num_tokens": 1651224778.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1333.0, "completions/mean_length": 2028.6875, "completions/mean_terminated_length": 400.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.405599474906921, "epoch": 0.7746671218846023, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3256131045253127e-07, "loss": 0.0, "num_tokens": 1652339994.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 2037.66796875, "completions/mean_terminated_length": 284.66668701171875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 7.382565140724182, "epoch": 0.7750085353362922, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3218140374545137e-07, "loss": 0.0, "num_tokens": 1653457264.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 2036.9140625, "completions/mean_terminated_length": 156.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 7.433531403541565, "epoch": 0.7753499487879822, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3180194846605364e-07, "loss": 0.0, "num_tokens": 1654576004.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1282.0, "completions/mean_length": 2035.826171875, "completions/mean_terminated_length": 489.75, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 7.44642961025238, "epoch": 0.7756913622396723, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3142294515331437e-07, "loss": 0.0, "num_tokens": 1655692651.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 2036.46875, "completions/mean_terminated_length": 572.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.421375513076782, "epoch": 0.7760327756913622, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3104439434556775e-07, "loss": 0.0, "num_tokens": 1656807627.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 2039.650390625, "completions/mean_terminated_length": 623.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.388588905334473, "epoch": 0.7763741891430522, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3066629658050482e-07, "loss": 0.0, "num_tokens": 1657931288.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 2035.11328125, "completions/mean_terminated_length": 398.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 7.413827657699585, "epoch": 0.7767156025947423, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3028865239517363e-07, "loss": 0.0, "num_tokens": 1659047634.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1284.0, "completions/mean_length": 2035.076171875, "completions/mean_terminated_length": 393.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.403693675994873, "epoch": 0.7770570160464322, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.299114623259778e-07, "loss": 0.0, "num_tokens": 1660163465.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 2032.16015625, "completions/mean_terminated_length": 426.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.426462769508362, "epoch": 0.7773984294981222, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.29534726908676e-07, "loss": 0.0, "num_tokens": 1661274763.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 2037.876953125, "completions/mean_terminated_length": 752.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 7.4530826807022095, "epoch": 0.7777398429498122, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2915844667838074e-07, "loss": 0.0, "num_tokens": 1662396220.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1184.0, "completions/mean_length": 2024.99609375, "completions/mean_terminated_length": 365.4285888671875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.3628329038619995, "epoch": 0.7780812564015022, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2878262216955863e-07, "loss": 0.0, "num_tokens": 1663510874.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 2033.6640625, "completions/mean_terminated_length": 213.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 7.37825345993042, "epoch": 0.7784226698531922, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2840725391602826e-07, "loss": 0.0, "num_tokens": 1664632990.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1197.0, "completions/mean_length": 2022.3203125, "completions/mean_terminated_length": 404.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.392406940460205, "epoch": 0.7787640833048822, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2803234245096062e-07, "loss": 0.0, "num_tokens": 1665740162.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1050.0, "completions/mean_length": 2022.068359375, "completions/mean_terminated_length": 388.375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.389852166175842, "epoch": 0.7791054967565723, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2765788830687782e-07, "loss": 0.0, "num_tokens": 1666857733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 2033.5859375, "completions/mean_terminated_length": 203.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.4082183837890625, "epoch": 0.7794469102082622, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2728389201565252e-07, "loss": 0.0, "num_tokens": 1667977489.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1923.0, "completions/mean_length": 2036.828125, "completions/mean_terminated_length": 618.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 7.3890721797943115, "epoch": 0.7797883236599522, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.269103541085065e-07, "loss": 0.0, "num_tokens": 1669094905.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1062.0, "completions/mean_length": 2040.0, "completions/mean_terminated_length": 682.6666870117188, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 7.426455736160278, "epoch": 0.7801297371116422, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2653727511601115e-07, "loss": 0.0, "num_tokens": 1670218201.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1148.0, "completions/mean_length": 2027.220703125, "completions/mean_terminated_length": 528.1428833007812, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.430817365646362, "epoch": 0.7804711505633322, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.261646555680855e-07, "loss": 0.0, "num_tokens": 1671330202.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 2040.8359375, "completions/mean_terminated_length": 214.0, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 7.464882731437683, "epoch": 0.7808125640150222, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2579249599399616e-07, "loss": 0.0, "num_tokens": 1672449286.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1098.0, "completions/mean_length": 2034.79296875, "completions/mean_terminated_length": 357.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.399934649467468, "epoch": 0.7811539774667122, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.254207969223566e-07, "loss": 0.0, "num_tokens": 1673564604.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 2041.666015625, "completions/mean_terminated_length": 426.5, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 7.362803220748901, "epoch": 0.7814953909184021, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.25049558881126e-07, "loss": 0.0, "num_tokens": 1674693601.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1828.0, "completions/mean_length": 2039.9296875, "completions/mean_terminated_length": 1015.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 7.47403347492218, "epoch": 0.7818368043700922, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2467878239760851e-07, "loss": 0.0, "num_tokens": 1675819341.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1324.0, "completions/mean_length": 2029.09765625, "completions/mean_terminated_length": 665.4285888671875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 7.458285212516785, "epoch": 0.7821782178217822, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.243084679984531e-07, "loss": 0.0, "num_tokens": 1676928367.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1088.0, "completions/mean_length": 2035.744140625, "completions/mean_terminated_length": 479.25, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 7.413696646690369, "epoch": 0.7825196312734721, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2393861620965205e-07, "loss": 0.0, "num_tokens": 1678052076.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 2035.064453125, "completions/mean_terminated_length": 392.25, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 7.360121726989746, "epoch": 0.7828610447251622, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2356922755654068e-07, "loss": 0.0, "num_tokens": 1679174541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 2020.421875, "completions/mean_terminated_length": 283.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.465755939483643, "epoch": 0.7832024581768522, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2320030256379656e-07, "loss": 0.0, "num_tokens": 1680278357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 2036.451171875, "completions/mean_terminated_length": 77.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.465723752975464, "epoch": 0.7835438716285421, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2283184175543867e-07, "loss": 0.0, "num_tokens": 1681398028.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 2034.904296875, "completions/mean_terminated_length": 707.0, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 7.436160564422607, "epoch": 0.7838852850802321, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2246384565482645e-07, "loss": 0.0, "num_tokens": 1682518299.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 2025.50390625, "completions/mean_terminated_length": 402.5714416503906, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 7.4414530992507935, "epoch": 0.7842266985319222, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.220963147846595e-07, "loss": 0.0, "num_tokens": 1683634365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1424.0, "completions/mean_length": 2025.580078125, "completions/mean_terminated_length": 408.14288330078125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.421186447143555, "epoch": 0.7845681119836122, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.217292496669764e-07, "loss": 0.0, "num_tokens": 1684757958.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 2028.08984375, "completions/mean_terminated_length": 349.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.469198822975159, "epoch": 0.7849095254353021, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2136265082315442e-07, "loss": 0.0, "num_tokens": 1685873236.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 2025.787109375, "completions/mean_terminated_length": 152.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 7.438539028167725, "epoch": 0.7852509388869922, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.209965187739084e-07, "loss": 0.0, "num_tokens": 1686990615.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1350.0, "completions/mean_length": 2006.458984375, "completions/mean_terminated_length": 275.5833435058594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.427402853965759, "epoch": 0.7855923523386822, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2063085403929024e-07, "loss": 0.0, "num_tokens": 1688099202.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1988.24609375, "completions/mean_terminated_length": 518.2999877929688, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 7.310879945755005, "epoch": 0.7859337657903721, "frac_reward_zero_std": 0.96875, "grad_norm": 0.02828288012928293, "learning_rate": 2.2026565713868782e-07, "loss": -0.0023, "num_tokens": 1689193008.0, "reward": 0.01953125, "reward_std": 0.012757759541273117, "rewards/accuracy_reward/mean": 0.005859375, "rewards/accuracy_reward/std": 0.07639661431312561, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.013671875, "rewards/tag_count_reward/std": 0.07856711745262146, "step": 2302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1850.0, "completions/mean_length": 2030.751953125, "completions/mean_terminated_length": 786.4285888671875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 7.4101338386535645, "epoch": 0.7862751792420621, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1990092859082492e-07, "loss": 0.0, "num_tokens": 1690309713.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1354.0, "completions/mean_length": 2035.396484375, "completions/mean_terminated_length": 434.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.440084099769592, "epoch": 0.7866165926937522, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1953666891375966e-07, "loss": 0.0, "num_tokens": 1691426444.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 2025.98828125, "completions/mean_terminated_length": 169.6666717529297, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.462818622589111, "epoch": 0.7869580061454421, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1917287862488438e-07, "loss": 0.0, "num_tokens": 1692537926.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 2038.04296875, "completions/mean_terminated_length": 348.66668701171875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 7.398916125297546, "epoch": 0.7872994195971321, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1880955824092473e-07, "loss": 0.0, "num_tokens": 1693658540.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 2034.703125, "completions/mean_terminated_length": 346.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.419681191444397, "epoch": 0.7876408330488222, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1844670827793898e-07, "loss": 0.0, "num_tokens": 1694778228.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 2027.12109375, "completions/mean_terminated_length": 266.3333435058594, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.410686254501343, "epoch": 0.7879822465005121, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1808432925131686e-07, "loss": 0.0, "num_tokens": 1695894722.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 2033.625, "completions/mean_terminated_length": 208.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.450358510017395, "epoch": 0.7883236599522021, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1772242167577973e-07, "loss": 0.0, "num_tokens": 1697011778.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1041.0, "completions/mean_length": 2035.365234375, "completions/mean_terminated_length": 430.75, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 7.3860249519348145, "epoch": 0.7886650734038921, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1736098606537867e-07, "loss": 0.0, "num_tokens": 1698128477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 2007.939453125, "completions/mean_terminated_length": 338.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.37375271320343, "epoch": 0.7890064868555821, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.170000229334949e-07, "loss": 0.0, "num_tokens": 1699232894.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 2029.958984375, "completions/mean_terminated_length": 200.60000610351562, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 7.460270166397095, "epoch": 0.7893479003072721, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1663953279283828e-07, "loss": 0.0, "num_tokens": 1700354889.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1799.0, "completions/mean_length": 2038.6015625, "completions/mean_terminated_length": 845.0, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 7.450420379638672, "epoch": 0.7896893137589621, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1627951615544716e-07, "loss": 0.0, "num_tokens": 1701480045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1284.0, "completions/mean_length": 2038.951171875, "completions/mean_terminated_length": 503.66668701171875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 7.420376420021057, "epoch": 0.7900307272106522, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1591997353268666e-07, "loss": 0.0, "num_tokens": 1702599716.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 2036.31640625, "completions/mean_terminated_length": 851.6000366210938, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.4493526220321655, "epoch": 0.7903721406623421, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.155609054352494e-07, "loss": 0.0, "num_tokens": 1703712870.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 2040.552734375, "completions/mean_terminated_length": 141.5, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 7.446033000946045, "epoch": 0.7907135541140321, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1520231237315326e-07, "loss": 0.0, "num_tokens": 1704835249.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 2025.46875, "completions/mean_terminated_length": 125.33333587646484, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.461864948272705, "epoch": 0.7910549675657221, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1484419485574196e-07, "loss": 0.0, "num_tokens": 1705947041.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.955078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1787.0, "completions/mean_length": 1980.98828125, "completions/mean_terminated_length": 556.2608642578125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 7.377618312835693, "epoch": 0.791396381017412, "frac_reward_zero_std": 0.96875, "grad_norm": 0.04589460150334441, "learning_rate": 2.1448655339168347e-07, "loss": 0.0004, "num_tokens": 1707040683.0, "reward": 0.01513671875, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 2318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 2037.361328125, "completions/mean_terminated_length": 958.6000366210938, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 7.402858376502991, "epoch": 0.7917377944691021, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.141293884889699e-07, "loss": 0.0, "num_tokens": 1708164900.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 2032.79296875, "completions/mean_terminated_length": 101.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.433088421821594, "epoch": 0.7920792079207921, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.137727006549157e-07, "loss": 0.0, "num_tokens": 1709270858.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1776.0, "completions/mean_length": 2031.53125, "completions/mean_terminated_length": 642.6666870117188, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 7.408309459686279, "epoch": 0.792420621372482, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1341649039615865e-07, "loss": 0.0, "num_tokens": 1710388778.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 2033.59765625, "completions/mean_terminated_length": 204.5, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 7.406351447105408, "epoch": 0.7927620348241721, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1306075821865737e-07, "loss": 0.0, "num_tokens": 1711506268.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1088.0, "completions/mean_length": 2031.998046875, "completions/mean_terminated_length": 409.3999938964844, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 7.412235379219055, "epoch": 0.7931034482758621, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1270550462769212e-07, "loss": 0.0, "num_tokens": 1712617467.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 2036.642578125, "completions/mean_terminated_length": 594.25, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 7.400817155838013, "epoch": 0.793444861727552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1235073012786253e-07, "loss": 0.0, "num_tokens": 1713738548.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 2033.2421875, "completions/mean_terminated_length": 159.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.445667147636414, "epoch": 0.793786275179242, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.119964352230888e-07, "loss": 0.0, "num_tokens": 1714854784.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2044.005859375, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.437542080879211, "epoch": 0.7941276886309321, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.11642620416609e-07, "loss": 0.0, "num_tokens": 1715978659.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1322.0, "completions/mean_length": 2035.67578125, "completions/mean_terminated_length": 470.5, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 7.405765175819397, "epoch": 0.794469102082622, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1128928621097985e-07, "loss": 0.0, "num_tokens": 1717094909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 2040.05078125, "completions/mean_terminated_length": 691.3333740234375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 7.45795738697052, "epoch": 0.794810515534312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.109364331080749e-07, "loss": 0.0, "num_tokens": 1718225479.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1276.0, "completions/mean_length": 2028.689453125, "completions/mean_terminated_length": 400.16668701171875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.419090509414673, "epoch": 0.7951519289860021, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.10584061609085e-07, "loss": 0.0, "num_tokens": 1719346104.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 2037.1328125, "completions/mean_terminated_length": 193.33334350585938, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 7.427059531211853, "epoch": 0.7954933424376921, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1023217221451603e-07, "loss": 0.0, "num_tokens": 1720455948.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1158.0, "completions/mean_length": 2031.693359375, "completions/mean_terminated_length": 656.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 7.339108347892761, "epoch": 0.795834755889382, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.098807654241903e-07, "loss": 0.0, "num_tokens": 1721575407.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 2032.755859375, "completions/mean_terminated_length": 96.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.40820586681366, "epoch": 0.796176169341072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0952984173724348e-07, "loss": 0.0, "num_tokens": 1722705426.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1062.0, "completions/mean_length": 2032.908203125, "completions/mean_terminated_length": 502.6000061035156, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 7.418184876441956, "epoch": 0.7965175827927621, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.091794016521259e-07, "loss": 0.0, "num_tokens": 1723829651.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 2037.515625, "completions/mean_terminated_length": 258.66668701171875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 7.399709224700928, "epoch": 0.796858996244452, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.088294456666002e-07, "loss": 0.0, "num_tokens": 1724948923.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1212.0, "completions/mean_length": 2033.138671875, "completions/mean_terminated_length": 526.2000122070312, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 7.40534770488739, "epoch": 0.797200409696142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0847997427774222e-07, "loss": 0.0, "num_tokens": 1726061810.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1787.0, "completions/mean_length": 2016.4609375, "completions/mean_terminated_length": 433.20001220703125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 7.351101875305176, "epoch": 0.7975418231478321, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0813098798193875e-07, "loss": 0.0, "num_tokens": 1727173790.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1179.0, "completions/mean_length": 2033.05078125, "completions/mean_terminated_length": 517.2000122070312, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.362094879150391, "epoch": 0.797883236599522, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0778248727488807e-07, "loss": 0.0, "num_tokens": 1728290248.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1096.0, "completions/mean_length": 2026.59375, "completions/mean_terminated_length": 482.2857360839844, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.4384765625, "epoch": 0.798224650051212, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0743447265159849e-07, "loss": 0.0, "num_tokens": 1729404760.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1090.0, "completions/mean_length": 2033.177734375, "completions/mean_terminated_length": 530.2000122070312, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 7.421752691268921, "epoch": 0.798566063502902, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0708694460638815e-07, "loss": 0.0, "num_tokens": 1730525747.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1248.0, "completions/mean_length": 1987.935546875, "completions/mean_terminated_length": 510.3500061035156, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 7.373767733573914, "epoch": 0.798907476954592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0673990363288354e-07, "loss": 0.0, "num_tokens": 1731619618.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 2045.615234375, "completions/mean_terminated_length": 827.0, "completions/min_length": 827.0, "completions/min_terminated_length": 827.0, "entropy": 7.464748024940491, "epoch": 0.799248890406282, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0639335022401998e-07, "loss": 0.0, "num_tokens": 1732737517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1470.0, "completions/mean_length": 2021.609375, "completions/mean_terminated_length": 546.6666870117188, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 7.457610249519348, "epoch": 0.799590303857972, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.060472848720396e-07, "loss": 0.0, "num_tokens": 1733853365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 2032.8125, "completions/mean_terminated_length": 104.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 7.414353013038635, "epoch": 0.799931717309662, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0570170806849172e-07, "loss": 0.0, "num_tokens": 1734974501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1167.0, "completions/mean_length": 2031.279296875, "completions/mean_terminated_length": 335.8000183105469, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.414556503295898, "epoch": 0.800273130761352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0535662030423163e-07, "loss": 0.0, "num_tokens": 1736093700.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1152.0, "completions/mean_length": 2038.26953125, "completions/mean_terminated_length": 387.3333435058594, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.470990180969238, "epoch": 0.800614544213042, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0501202206942008e-07, "loss": 0.0, "num_tokens": 1737219726.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 2038.189453125, "completions/mean_terminated_length": 373.66668701171875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.441462159156799, "epoch": 0.800955957664732, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0466791385352224e-07, "loss": 0.0, "num_tokens": 1738351103.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1995.208984375, "completions/mean_terminated_length": 819.4091186523438, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 7.248922944068909, "epoch": 0.801297371116422, "frac_reward_zero_std": 0.96875, "grad_norm": 0.563268869722122, "learning_rate": 2.0432429614530761e-07, "loss": 0.0039, "num_tokens": 1739450058.0, "reward": 0.01513671875, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 2347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 7.3818359375, "epoch": 0.801638784568112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0398116943284848e-07, "loss": 0.0, "num_tokens": 1740574426.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 2032.81640625, "completions/mean_terminated_length": 104.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 7.41812539100647, "epoch": 0.801980198019802, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0363853420352022e-07, "loss": 0.0, "num_tokens": 1741694172.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1258.0, "completions/mean_length": 2016.98828125, "completions/mean_terminated_length": 283.77777099609375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.404699802398682, "epoch": 0.802321611471492, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.032963909439999e-07, "loss": 0.0, "num_tokens": 1742800262.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 2034.736328125, "completions/mean_terminated_length": 350.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.43416440486908, "epoch": 0.802663024923182, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0295474014026592e-07, "loss": 0.0, "num_tokens": 1743927359.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 2036.740234375, "completions/mean_terminated_length": 126.33333587646484, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.365596890449524, "epoch": 0.803004438374872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0261358227759687e-07, "loss": 0.0, "num_tokens": 1745051722.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 2024.955078125, "completions/mean_terminated_length": 362.4285888671875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.379928231239319, "epoch": 0.8033458518265619, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0227291784057166e-07, "loss": 0.0, "num_tokens": 1746168723.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1080.0, "completions/mean_length": 2036.9453125, "completions/mean_terminated_length": 633.0, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 7.3861881494522095, "epoch": 0.803687265278252, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0193274731306777e-07, "loss": 0.0, "num_tokens": 1747294983.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.970703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 1997.134765625, "completions/mean_terminated_length": 311.8000183105469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.3432776927948, "epoch": 0.804028678729942, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0159307117826152e-07, "loss": 0.0, "num_tokens": 1748400492.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 2032.53515625, "completions/mean_terminated_length": 68.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.375041723251343, "epoch": 0.8043700921816319, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.01253889918627e-07, "loss": 0.0, "num_tokens": 1749514638.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.966796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1337.0, "completions/mean_length": 2008.9453125, "completions/mean_terminated_length": 871.7647094726562, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 7.27467155456543, "epoch": 0.8047115056333219, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0091520401593536e-07, "loss": 0.0, "num_tokens": 1750627762.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 2034.54296875, "completions/mean_terminated_length": 325.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.406401515007019, "epoch": 0.805052919085012, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0057701395125392e-07, "loss": 0.0, "num_tokens": 1751757944.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 2034.80078125, "completions/mean_terminated_length": 358.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 7.427626252174377, "epoch": 0.8053943325367019, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0023932020494602e-07, "loss": 0.0, "num_tokens": 1752881282.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1090.0, "completions/mean_length": 2042.30859375, "completions/mean_terminated_length": 591.0, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 7.468341112136841, "epoch": 0.8057357459883919, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9990212325666973e-07, "loss": 0.0, "num_tokens": 1753996880.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 2033.025390625, "completions/mean_terminated_length": 770.1666870117188, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.462019562721252, "epoch": 0.806077159440082, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9956542358537764e-07, "loss": 0.0, "num_tokens": 1755108909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 2033.0, "completions/mean_terminated_length": 128.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.375357031822205, "epoch": 0.806418572891772, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.992292216693162e-07, "loss": 0.0, "num_tokens": 1756231693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 2037.177734375, "completions/mean_terminated_length": 201.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 7.425641655921936, "epoch": 0.8067599863434619, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9889351798602454e-07, "loss": 0.0, "num_tokens": 1757352808.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1055.0, "completions/mean_length": 2042.55078125, "completions/mean_terminated_length": 653.0, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "entropy": 7.412376880645752, "epoch": 0.8071013997951519, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9855831301233414e-07, "loss": 0.0, "num_tokens": 1758468786.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1101.0, "completions/mean_length": 2036.921875, "completions/mean_terminated_length": 630.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 7.377837777137756, "epoch": 0.807442813246842, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9822360722436838e-07, "loss": 0.0, "num_tokens": 1759590586.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1138.0, "completions/mean_length": 2032.421875, "completions/mean_terminated_length": 452.8000183105469, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 7.404052734375, "epoch": 0.8077842266985319, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9788940109754114e-07, "loss": 0.0, "num_tokens": 1760712994.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 1993.103515625, "completions/mean_terminated_length": 642.6500244140625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 7.307990670204163, "epoch": 0.8081256401502219, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9755569510655701e-07, "loss": 0.0, "num_tokens": 1761819063.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 2032.720703125, "completions/mean_terminated_length": 483.3999938964844, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 7.335814476013184, "epoch": 0.808467053601912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9722248972541e-07, "loss": 0.0, "num_tokens": 1762943016.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 2036.501953125, "completions/mean_terminated_length": 85.66667175292969, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 7.4020267724990845, "epoch": 0.8088084670536019, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9688978542738326e-07, "loss": 0.0, "num_tokens": 1764061321.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 2034.48046875, "completions/mean_terminated_length": 317.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.397783637046814, "epoch": 0.8091498805052919, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9655758268504785e-07, "loss": 0.0, "num_tokens": 1765176367.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1996.248046875, "completions/mean_terminated_length": 723.1500244140625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.2759387493133545, "epoch": 0.8094912939569819, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9622588197026268e-07, "loss": 0.0, "num_tokens": 1766291070.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 2031.677734375, "completions/mean_terminated_length": 376.6000061035156, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 7.400606989860535, "epoch": 0.8098327074086719, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.958946837541734e-07, "loss": 0.0, "num_tokens": 1767412601.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 2037.896484375, "completions/mean_terminated_length": 323.66668701171875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 7.399870038032532, "epoch": 0.8101741208603619, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.955639885072121e-07, "loss": 0.0, "num_tokens": 1768538100.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1215.0, "completions/mean_length": 2030.24609375, "completions/mean_terminated_length": 749.4285888671875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 7.463273048400879, "epoch": 0.8105155343120519, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9523379669909646e-07, "loss": 0.0, "num_tokens": 1769657730.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 2040.4296875, "completions/mean_terminated_length": 110.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 7.3851025104522705, "epoch": 0.8108569477637418, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9490410879882897e-07, "loss": 0.0, "num_tokens": 1770778814.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2013.53125, "completions/mean_terminated_length": 1119.157958984375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.2593876123428345, "epoch": 0.8111983612154319, "frac_reward_zero_std": 0.96875, "grad_norm": 0.02481356964995232, "learning_rate": 1.9457492527469628e-07, "loss": 0.0007, "num_tokens": 1771889214.0, "reward": 0.01123046875, "reward_std": 0.004002714995294809, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01123046875, "rewards/tag_count_reward/std": 0.06632548570632935, "step": 2376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1033.0, "completions/mean_length": 2014.998046875, "completions/mean_terminated_length": 170.55555725097656, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.3021992444992065, "epoch": 0.8115397746671219, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9424624659426897e-07, "loss": 0.0, "num_tokens": 1773001597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 2040.654296875, "completions/mean_terminated_length": 167.5, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 7.452831387519836, "epoch": 0.8118811881188119, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9391807322440007e-07, "loss": 0.0, "num_tokens": 1774120748.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 2040.40234375, "completions/mean_terminated_length": 103.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 7.334980845451355, "epoch": 0.8122226015705019, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9359040563122522e-07, "loss": 0.0, "num_tokens": 1775241674.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 7.4248046875, "epoch": 0.8125640150221919, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.932632442801616e-07, "loss": 0.0, "num_tokens": 1776363898.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 2041.029296875, "completions/mean_terminated_length": 263.5, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 7.3335942029953, "epoch": 0.8129054284738819, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.929365896359074e-07, "loss": 0.0, "num_tokens": 1777485305.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1103.0, "completions/mean_length": 2038.494140625, "completions/mean_terminated_length": 425.66668701171875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.44973087310791, "epoch": 0.8132468419255718, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9261044216244077e-07, "loss": 0.0, "num_tokens": 1778612518.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 2026.4140625, "completions/mean_terminated_length": 469.14288330078125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 7.46022355556488, "epoch": 0.8135882553772619, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9228480232301977e-07, "loss": 0.0, "num_tokens": 1779722602.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1092.0, "completions/mean_length": 2016.205078125, "completions/mean_terminated_length": 420.1000061035156, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.399633765220642, "epoch": 0.8139296688289519, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9195967058018125e-07, "loss": 0.0, "num_tokens": 1780832499.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 2037.642578125, "completions/mean_terminated_length": 722.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.452705144882202, "epoch": 0.8142710822806418, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9163504739574054e-07, "loss": 0.0, "num_tokens": 1781961644.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 2028.09765625, "completions/mean_terminated_length": 592.2857666015625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 7.392954587936401, "epoch": 0.8146124957323319, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9131093323079044e-07, "loss": 0.0, "num_tokens": 1783081582.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 2022.1015625, "completions/mean_terminated_length": 153.71429443359375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.451143980026245, "epoch": 0.8149539091840219, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9098732854570104e-07, "loss": 0.0, "num_tokens": 1784196738.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2039.322265625, "completions/mean_terminated_length": 1801.1666259765625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.209701895713806, "epoch": 0.8152953226357118, "frac_reward_zero_std": 0.96875, "grad_norm": 0.016650904110734627, "learning_rate": 1.906642338001182e-07, "loss": 0.0021, "num_tokens": 1785326567.0, "reward": 0.01806640625, "reward_std": 0.017286352813243866, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08812850713729858, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01025390625, "rewards/tag_count_reward/std": 0.06071438640356064, "step": 2388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 2029.67578125, "completions/mean_terminated_length": 171.60000610351562, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 7.434085965156555, "epoch": 0.8156367360874018, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9034164945296415e-07, "loss": 0.0, "num_tokens": 1786443217.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 2037.419921875, "completions/mean_terminated_length": 242.33334350585938, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 7.39635705947876, "epoch": 0.8159781495390919, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9001957596243557e-07, "loss": 0.0, "num_tokens": 1787564024.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1158.0, "completions/mean_length": 2038.451171875, "completions/mean_terminated_length": 825.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.346747517585754, "epoch": 0.8163195629907818, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.896980137860038e-07, "loss": 0.0, "num_tokens": 1788687423.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 2027.291015625, "completions/mean_terminated_length": 533.2857666015625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 7.433058738708496, "epoch": 0.8166609764424718, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8937696338041397e-07, "loss": 0.0, "num_tokens": 1789801044.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 2028.84375, "completions/mean_terminated_length": 413.3333435058594, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 7.375016450881958, "epoch": 0.8170023898941619, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.890564252016843e-07, "loss": 0.0, "num_tokens": 1790914356.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 2044.01953125, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.425445914268494, "epoch": 0.8173438033458518, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.887363997051051e-07, "loss": 0.0, "num_tokens": 1792033630.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 2040.58984375, "completions/mean_terminated_length": 151.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 7.385557651519775, "epoch": 0.8176852167975418, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8841688734523898e-07, "loss": 0.0, "num_tokens": 1793162940.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 2031.18359375, "completions/mean_terminated_length": 613.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.407717943191528, "epoch": 0.8180266302492318, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8809788857591918e-07, "loss": 0.0, "num_tokens": 1794280714.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 2032.365234375, "completions/mean_terminated_length": 447.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 7.438931345939636, "epoch": 0.8183680437009219, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8777940385024983e-07, "loss": 0.0, "num_tokens": 1795396629.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1854.0, "completions/mean_length": 2037.197265625, "completions/mean_terminated_length": 665.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.43440306186676, "epoch": 0.8187094571526118, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8746143362060473e-07, "loss": 0.0, "num_tokens": 1796520122.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 2035.0234375, "completions/mean_terminated_length": 719.2000122070312, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 7.45437753200531, "epoch": 0.8190508706043018, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8714397833862717e-07, "loss": 0.0, "num_tokens": 1797637366.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 7.435546875, "epoch": 0.8193922840559918, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8682703845522837e-07, "loss": 0.0, "num_tokens": 1798766726.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 2032.439453125, "completions/mean_terminated_length": 454.6000061035156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.4467644691467285, "epoch": 0.8197336975076818, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.865106144205883e-07, "loss": 0.0, "num_tokens": 1799878999.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 2031.4453125, "completions/mean_terminated_length": 352.8000183105469, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 7.407930493354797, "epoch": 0.8200751109593718, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8619470668415351e-07, "loss": 0.0, "num_tokens": 1800993051.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 2022.08984375, "completions/mean_terminated_length": 389.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.3910263776779175, "epoch": 0.8204165244110618, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.858793156946376e-07, "loss": 0.0, "num_tokens": 1802101257.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 2025.23828125, "completions/mean_terminated_length": 383.14288330078125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.407380700111389, "epoch": 0.8207579378627518, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.855644419000202e-07, "loss": 0.0, "num_tokens": 1803212035.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1822.0, "completions/mean_length": 2029.525390625, "completions/mean_terminated_length": 471.5, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 7.436526775360107, "epoch": 0.8210993513144418, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.85250085747546e-07, "loss": 0.0, "num_tokens": 1804327248.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2007.404296875, "completions/mean_terminated_length": 1103.227294921875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.311215162277222, "epoch": 0.8214407647661318, "frac_reward_zero_std": 0.96875, "grad_norm": 0.4962796396079381, "learning_rate": 1.84936247683725e-07, "loss": 0.0091, "num_tokens": 1805436735.0, "reward": 0.02392578125, "reward_std": 0.01859063096344471, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.10772226005792618, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01220703125, "rewards/tag_count_reward/std": 0.07148420810699463, "step": 2406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 2037.328125, "completions/mean_terminated_length": 226.6666717529297, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.455205678939819, "epoch": 0.8217821782178217, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8462292815433057e-07, "loss": 0.0, "num_tokens": 1806553111.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1213.0, "completions/mean_length": 2016.64453125, "completions/mean_terminated_length": 442.6000061035156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.40875506401062, "epoch": 0.8221235916695118, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8431012760440028e-07, "loss": 0.0, "num_tokens": 1807652881.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1362.0, "completions/mean_length": 2039.83203125, "completions/mean_terminated_length": 654.0, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 7.334492921829224, "epoch": 0.8224650051212018, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8399784647823388e-07, "loss": 0.0, "num_tokens": 1808773067.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 2037.505859375, "completions/mean_terminated_length": 257.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.456486940383911, "epoch": 0.8228064185728917, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8368608521939383e-07, "loss": 0.0, "num_tokens": 1809889742.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1059.0, "completions/mean_length": 2033.390625, "completions/mean_terminated_length": 552.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 7.333209991455078, "epoch": 0.8231478320245817, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8337484427070406e-07, "loss": 0.0, "num_tokens": 1811006070.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 2029.5625, "completions/mean_terminated_length": 474.66668701171875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.466232419013977, "epoch": 0.8234892454762718, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8306412407424938e-07, "loss": 0.0, "num_tokens": 1812115350.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1060.0, "completions/mean_length": 2029.294921875, "completions/mean_terminated_length": 451.8333435058594, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.415727853775024, "epoch": 0.8238306589279618, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8275392507137482e-07, "loss": 0.0, "num_tokens": 1813236813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1797.0, "completions/mean_length": 2033.322265625, "completions/mean_terminated_length": 795.5, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 7.424195051193237, "epoch": 0.8241720723796517, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8244424770268547e-07, "loss": 0.0, "num_tokens": 1814360578.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 2025.857421875, "completions/mean_terminated_length": 428.4285888671875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 7.390285491943359, "epoch": 0.8245134858313418, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.821350924080449e-07, "loss": 0.0, "num_tokens": 1815474905.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1205.0, "completions/mean_length": 2042.50390625, "completions/mean_terminated_length": 641.0, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 7.4050644636154175, "epoch": 0.8248548992830318, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.818264596265758e-07, "loss": 0.0, "num_tokens": 1816597723.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 2025.81640625, "completions/mean_terminated_length": 425.4285888671875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.356980085372925, "epoch": 0.8251963127347217, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8151834979665825e-07, "loss": 0.0, "num_tokens": 1817712669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 7.3896484375, "epoch": 0.8255377261864117, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8121076335592976e-07, "loss": 0.0, "num_tokens": 1818838061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1816.0, "completions/mean_length": 2024.8203125, "completions/mean_terminated_length": 729.3333129882812, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.433948755264282, "epoch": 0.8258791396381018, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.809037007412842e-07, "loss": 0.0, "num_tokens": 1819951089.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1146.0, "completions/mean_length": 2034.79296875, "completions/mean_terminated_length": 357.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 7.364455938339233, "epoch": 0.8262205530897917, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8059716238887163e-07, "loss": 0.0, "num_tokens": 1821071399.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 2030.326171875, "completions/mean_terminated_length": 238.1999969482422, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 7.4276769161224365, "epoch": 0.8265619665414817, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.802911487340972e-07, "loss": 0.0, "num_tokens": 1822186782.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 2037.822265625, "completions/mean_terminated_length": 311.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 7.391058921813965, "epoch": 0.8269033799931718, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7998566021162088e-07, "loss": 0.0, "num_tokens": 1823312995.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 2045.12109375, "completions/mean_terminated_length": 574.0, "completions/min_length": 574.0, "completions/min_terminated_length": 574.0, "entropy": 7.371769666671753, "epoch": 0.8272447934448617, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.79680697255357e-07, "loss": 0.0, "num_tokens": 1824437409.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1116.0, "completions/mean_length": 2026.12109375, "completions/mean_terminated_length": 447.71429443359375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 7.488344669342041, "epoch": 0.8275862068965517, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7937626029847312e-07, "loss": 0.0, "num_tokens": 1825553663.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 2040.345703125, "completions/mean_terminated_length": 88.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 7.392685413360596, "epoch": 0.8279276203482417, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7907234977338965e-07, "loss": 0.0, "num_tokens": 1826675280.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 2040.107421875, "completions/mean_terminated_length": 701.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 7.365086555480957, "epoch": 0.8282690337999317, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7876896611177938e-07, "loss": 0.0, "num_tokens": 1827798135.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1090.0, "completions/mean_length": 2023.787109375, "completions/mean_terminated_length": 498.375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 7.418040752410889, "epoch": 0.8286104472516217, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7846610974456655e-07, "loss": 0.0, "num_tokens": 1828912026.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 2030.447265625, "completions/mean_terminated_length": 764.1428833007812, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.42345118522644, "epoch": 0.8289518607033117, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.781637811019267e-07, "loss": 0.0, "num_tokens": 1830026239.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1730.0, "completions/mean_length": 2022.078125, "completions/mean_terminated_length": 573.3333129882812, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.393423557281494, "epoch": 0.8292932741550018, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7786198061328567e-07, "loss": 0.0, "num_tokens": 1831138167.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 2031.88671875, "completions/mean_terminated_length": 673.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.367525219917297, "epoch": 0.8296346876066917, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7756070870731921e-07, "loss": 0.0, "num_tokens": 1832253181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1647.0, "completions/mean_length": 2034.1875, "completions/mean_terminated_length": 633.6000366210938, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 7.4416184425354, "epoch": 0.8299761010583817, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7725996581195196e-07, "loss": 0.0, "num_tokens": 1833372957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 2037.416015625, "completions/mean_terminated_length": 241.6666717529297, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.380924701690674, "epoch": 0.8303175145100717, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7695975235435765e-07, "loss": 0.0, "num_tokens": 1834492818.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 1990.96484375, "completions/mean_terminated_length": 425.6666564941406, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.3588173389434814, "epoch": 0.8306589279617617, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.766600687609574e-07, "loss": 0.0, "num_tokens": 1835592560.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 2044.53125, "completions/mean_terminated_length": 272.0, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 7.382876515388489, "epoch": 0.8310003414134517, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7636091545742038e-07, "loss": 0.0, "num_tokens": 1836719968.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1921.0, "completions/mean_length": 2029.091796875, "completions/mean_terminated_length": 665.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.4094765186309814, "epoch": 0.8313417548651417, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7606229286866175e-07, "loss": 0.0, "num_tokens": 1837833119.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 2040.884765625, "completions/mean_terminated_length": 226.5, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 7.482037782669067, "epoch": 0.8316831683168316, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.757642014188438e-07, "loss": 0.0, "num_tokens": 1838959556.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1041.0, "completions/mean_length": 2035.9765625, "completions/mean_terminated_length": 509.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.453561544418335, "epoch": 0.8320245817685217, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7546664153137359e-07, "loss": 0.0, "num_tokens": 1840075128.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 2036.552734375, "completions/mean_terminated_length": 94.33333587646484, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 7.408011555671692, "epoch": 0.8323659952202117, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7516961362890364e-07, "loss": 0.0, "num_tokens": 1841200051.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 2024.166015625, "completions/mean_terminated_length": 304.71429443359375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.411078453063965, "epoch": 0.8327074086719016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7487311813333038e-07, "loss": 0.0, "num_tokens": 1842318008.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1220.0, "completions/mean_length": 2039.63671875, "completions/mean_terminated_length": 620.6666870117188, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.388573169708252, "epoch": 0.8330488221235917, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7457715546579456e-07, "loss": 0.0, "num_tokens": 1843437902.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 2041.9140625, "completions/mean_terminated_length": 490.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 7.3910017013549805, "epoch": 0.8333902355752817, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7428172604667952e-07, "loss": 0.0, "num_tokens": 1844564930.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 2034.16796875, "completions/mean_terminated_length": 277.5, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 7.408169507980347, "epoch": 0.8337316490269716, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7398683029561185e-07, "loss": 0.0, "num_tokens": 1845682872.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1732.0, "completions/mean_length": 1979.650390625, "completions/mean_terminated_length": 457.3182067871094, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.307413101196289, "epoch": 0.8340730624786616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7369246863145948e-07, "loss": 0.0, "num_tokens": 1846786597.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 2038.623046875, "completions/mean_terminated_length": 447.66668701171875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.380413055419922, "epoch": 0.8344144759303517, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7339864147233225e-07, "loss": 0.0, "num_tokens": 1847906724.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 2019.455078125, "completions/mean_terminated_length": 221.125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.424484372138977, "epoch": 0.8347558893820417, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7310534923558025e-07, "loss": 0.0, "num_tokens": 1849014733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1403.0, "completions/mean_length": 2025.794921875, "completions/mean_terminated_length": 423.8571472167969, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.399832844734192, "epoch": 0.8350973028337316, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7281259233779436e-07, "loss": 0.0, "num_tokens": 1850125076.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.978515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 2013.6171875, "completions/mean_terminated_length": 447.6363830566406, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 7.390286684036255, "epoch": 0.8354387162854217, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7252037119480438e-07, "loss": 0.0, "num_tokens": 1851231440.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 2034.171875, "completions/mean_terminated_length": 278.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 7.420837163925171, "epoch": 0.8357801297371117, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7222868622167998e-07, "loss": 0.0, "num_tokens": 1852343224.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1121.0, "completions/mean_length": 2017.951171875, "completions/mean_terminated_length": 338.5555725097656, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.339394927024841, "epoch": 0.8361215431888016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7193753783272847e-07, "loss": 0.0, "num_tokens": 1853459439.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 2031.1015625, "completions/mean_terminated_length": 317.6000061035156, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 7.428701996803284, "epoch": 0.8364629566404916, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7164692644149557e-07, "loss": 0.0, "num_tokens": 1854575795.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 2038.533203125, "completions/mean_terminated_length": 432.3333435058594, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 7.443089008331299, "epoch": 0.8368043700921817, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.713568524607637e-07, "loss": 0.0, "num_tokens": 1855690692.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 2032.69140625, "completions/mean_terminated_length": 88.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.408985257148743, "epoch": 0.8371457835438716, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.710673163025526e-07, "loss": 0.0, "num_tokens": 1856815206.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 2034.1484375, "completions/mean_terminated_length": 275.0, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 7.447895646095276, "epoch": 0.8374871969955616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.707783183781174e-07, "loss": 0.0, "num_tokens": 1857933522.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1050.0, "completions/mean_length": 2030.92578125, "completions/mean_terminated_length": 591.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.438558101654053, "epoch": 0.8378286104472517, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7048985909794928e-07, "loss": 0.0, "num_tokens": 1859047532.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 2037.236328125, "completions/mean_terminated_length": 211.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.351935625076294, "epoch": 0.8381700238989416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7020193887177403e-07, "loss": 0.0, "num_tokens": 1860172261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1139.0, "completions/mean_length": 2038.080078125, "completions/mean_terminated_length": 778.25, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 7.421736240386963, "epoch": 0.8385114373506316, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6991455810855204e-07, "loss": 0.0, "num_tokens": 1861290894.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 2037.3671875, "completions/mean_terminated_length": 233.33334350585938, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 7.421971917152405, "epoch": 0.8388528508023216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6962771721647705e-07, "loss": 0.0, "num_tokens": 1862403626.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2036.93359375, "completions/mean_terminated_length": 1764.7000732421875, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "entropy": 7.307016611099243, "epoch": 0.8391942642540116, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.693414166029764e-07, "loss": 0.0, "num_tokens": 1863524136.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 2030.775390625, "completions/mean_terminated_length": 284.20001220703125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.41375470161438, "epoch": 0.8395356777057016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6905565667470954e-07, "loss": 0.0, "num_tokens": 1864642069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1108.0, "completions/mean_length": 2034.80859375, "completions/mean_terminated_length": 359.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.411147475242615, "epoch": 0.8398770911573916, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6877043783756838e-07, "loss": 0.0, "num_tokens": 1865755347.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 2026.654296875, "completions/mean_terminated_length": 226.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.442112684249878, "epoch": 0.8402185046090817, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6848576049667605e-07, "loss": 0.0, "num_tokens": 1866861346.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 2040.181640625, "completions/mean_terminated_length": 46.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 7.392202615737915, "epoch": 0.8405599180607716, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6820162505638675e-07, "loss": 0.0, "num_tokens": 1867979967.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 2033.384765625, "completions/mean_terminated_length": 177.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 7.376197934150696, "epoch": 0.8409013315124616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6791803192028458e-07, "loss": 0.0, "num_tokens": 1869102148.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 2031.724609375, "completions/mean_terminated_length": 381.3999938964844, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 7.482461094856262, "epoch": 0.8412427449641516, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.676349814911837e-07, "loss": 0.0, "num_tokens": 1870217655.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 2032.892578125, "completions/mean_terminated_length": 114.25, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.401388645172119, "epoch": 0.8415841584158416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6735247417112718e-07, "loss": 0.0, "num_tokens": 1871347776.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 2025.650390625, "completions/mean_terminated_length": 776.5555419921875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 7.437988519668579, "epoch": 0.8419255718675316, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6707051036138687e-07, "loss": 0.0, "num_tokens": 1872464365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1344.0, "completions/mean_length": 2038.732421875, "completions/mean_terminated_length": 466.3333435058594, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 7.435006499290466, "epoch": 0.8422669853192216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6678909046246247e-07, "loss": 0.0, "num_tokens": 1873583060.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 2027.869140625, "completions/mean_terminated_length": 330.16668701171875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.362778902053833, "epoch": 0.8426083987709115, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6650821487408128e-07, "loss": 0.0, "num_tokens": 1874704801.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1071.0, "completions/mean_length": 2039.966796875, "completions/mean_terminated_length": 677.0, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 7.384565591812134, "epoch": 0.8429498122226016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6622788399519722e-07, "loss": 0.0, "num_tokens": 1875826768.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1293.0, "completions/mean_length": 2035.423828125, "completions/mean_terminated_length": 438.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 7.4129098653793335, "epoch": 0.8432912256742916, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6594809822399073e-07, "loss": 0.0, "num_tokens": 1876938841.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 2024.5078125, "completions/mean_terminated_length": 544.5, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 7.469834804534912, "epoch": 0.8436326391259815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6566885795786775e-07, "loss": 0.0, "num_tokens": 1878050173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1097.0, "completions/mean_length": 2024.994140625, "completions/mean_terminated_length": 365.2857360839844, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 7.4177610874176025, "epoch": 0.8439740525776716, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.653901635934596e-07, "loss": 0.0, "num_tokens": 1879161706.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 2033.349609375, "completions/mean_terminated_length": 172.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.396811604499817, "epoch": 0.8443154660293616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6511201552662212e-07, "loss": 0.0, "num_tokens": 1880278477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1142.0, "completions/mean_length": 2039.2734375, "completions/mean_terminated_length": 558.6666870117188, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 7.408627033233643, "epoch": 0.8446568794810515, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6483441415243538e-07, "loss": 0.0, "num_tokens": 1881395225.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 2030.447265625, "completions/mean_terminated_length": 250.60000610351562, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.385225534439087, "epoch": 0.8449982929327415, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.645573598652025e-07, "loss": 0.0, "num_tokens": 1882522430.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 2040.833984375, "completions/mean_terminated_length": 825.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 7.405104160308838, "epoch": 0.8453397063844316, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6428085305844997e-07, "loss": 0.0, "num_tokens": 1883653433.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 2044.14453125, "completions/mean_terminated_length": 74.0, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 7.37327241897583, "epoch": 0.8456811198361216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6400489412492625e-07, "loss": 0.0, "num_tokens": 1884777507.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 2041.611328125, "completions/mean_terminated_length": 412.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.395210027694702, "epoch": 0.8460225332878115, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6372948345660187e-07, "loss": 0.0, "num_tokens": 1885901692.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1135.0, "completions/mean_length": 2029.921875, "completions/mean_terminated_length": 505.3333435058594, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 7.399597764015198, "epoch": 0.8463639467395015, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6345462144466864e-07, "loss": 0.0, "num_tokens": 1887007524.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1191.0, "completions/mean_length": 2033.619140625, "completions/mean_terminated_length": 575.4000244140625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 7.358701109886169, "epoch": 0.8467053601911916, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6318030847953896e-07, "loss": 0.0, "num_tokens": 1888124305.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 2040.4453125, "completions/mean_terminated_length": 114.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.401957392692566, "epoch": 0.8470467736428815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6290654495084523e-07, "loss": 0.0, "num_tokens": 1889246581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 2025.357421875, "completions/mean_terminated_length": 115.83333587646484, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.3597332239151, "epoch": 0.8473881870945715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6263333124743972e-07, "loss": 0.0, "num_tokens": 1890362716.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1550.0, "completions/mean_length": 2038.267578125, "completions/mean_terminated_length": 1051.4000244140625, "completions/min_length": 630.0, "completions/min_terminated_length": 630.0, "entropy": 7.4131247997283936, "epoch": 0.8477296005462616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6236066775739343e-07, "loss": 0.0, "num_tokens": 1891489253.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 2045.5546875, "completions/mean_terminated_length": 796.0, "completions/min_length": 796.0, "completions/min_terminated_length": 796.0, "entropy": 7.357749581336975, "epoch": 0.8480710139979515, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6208855486799602e-07, "loss": 0.0, "num_tokens": 1892615969.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 2023.716796875, "completions/mean_terminated_length": 271.8571472167969, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.438335657119751, "epoch": 0.8484124274496415, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6181699296575515e-07, "loss": 0.0, "num_tokens": 1893730064.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 2033.24609375, "completions/mean_terminated_length": 159.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 7.397306442260742, "epoch": 0.8487538409013315, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6154598243639582e-07, "loss": 0.0, "num_tokens": 1894842302.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 2036.986328125, "completions/mean_terminated_length": 168.33334350585938, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 7.38167142868042, "epoch": 0.8490952543530215, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6127552366485957e-07, "loss": 0.0, "num_tokens": 1895970855.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1083.0, "completions/mean_length": 2034.6640625, "completions/mean_terminated_length": 682.4000244140625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 7.4321959018707275, "epoch": 0.8494366678047115, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6100561703530475e-07, "loss": 0.0, "num_tokens": 1897083723.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 2034.658203125, "completions/mean_terminated_length": 340.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.425142168998718, "epoch": 0.8497780812564015, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6073626293110485e-07, "loss": 0.0, "num_tokens": 1898203228.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1122.0, "completions/mean_length": 2018.609375, "completions/mean_terminated_length": 376.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.4369224309921265, "epoch": 0.8501194947080914, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6046746173484905e-07, "loss": 0.0, "num_tokens": 1899308052.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1668.0, "completions/mean_length": 1993.59375, "completions/mean_terminated_length": 581.8947143554688, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 7.330753564834595, "epoch": 0.8504609081597815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6019921382834104e-07, "loss": 0.0, "num_tokens": 1900409764.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1796.0, "completions/mean_length": 2036.638671875, "completions/mean_terminated_length": 884.6000366210938, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 7.444831848144531, "epoch": 0.8508023216114715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5993151959259855e-07, "loss": 0.0, "num_tokens": 1901536971.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 2036.666015625, "completions/mean_terminated_length": 887.4000244140625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 7.392350554466248, "epoch": 0.8511437350631615, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5966437940785281e-07, "loss": 0.0, "num_tokens": 1902654112.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 2042.04296875, "completions/mean_terminated_length": 523.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.377890586853027, "epoch": 0.8514851485148515, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5939779365354836e-07, "loss": 0.0, "num_tokens": 1903776326.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1083.0, "completions/mean_length": 2036.095703125, "completions/mean_terminated_length": 524.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.452697277069092, "epoch": 0.8518265619665415, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.591317627083419e-07, "loss": 0.0, "num_tokens": 1904890871.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1035.0, "completions/mean_length": 2032.251953125, "completions/mean_terminated_length": 435.3999938964844, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.358686566352844, "epoch": 0.8521679754182315, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5886628695010224e-07, "loss": 0.0, "num_tokens": 1906014216.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 2027.462890625, "completions/mean_terminated_length": 295.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.411273837089539, "epoch": 0.8525093888699214, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.586013667559096e-07, "loss": 0.0, "num_tokens": 1907128693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2037.2265625, "completions/mean_terminated_length": 1757.6842041015625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 7.203828573226929, "epoch": 0.8528508023216115, "frac_reward_zero_std": 0.96875, "grad_norm": 1.418224747139258, "learning_rate": 1.5833700250205528e-07, "loss": -0.0004, "num_tokens": 1908260857.0, "reward": 0.00048828125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00048828125, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 2040.251953125, "completions/mean_terminated_length": 64.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.339389085769653, "epoch": 0.8531922157733015, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5807319456404054e-07, "loss": 0.0, "num_tokens": 1909387546.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.951171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 1968.591796875, "completions/mean_terminated_length": 421.7200012207031, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 7.330472350120544, "epoch": 0.8535336292249914, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5780994331657667e-07, "loss": 0.0, "num_tokens": 1910480889.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1313.0, "completions/mean_length": 2041.255859375, "completions/mean_terminated_length": 897.0, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 7.399523854255676, "epoch": 0.8538750426766815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5754724913358417e-07, "loss": 0.0, "num_tokens": 1911602716.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 2034.583984375, "completions/mean_terminated_length": 330.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.401443004608154, "epoch": 0.8542164561283715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5728511238819235e-07, "loss": 0.0, "num_tokens": 1912724055.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1115.0, "completions/mean_length": 2030.759765625, "completions/mean_terminated_length": 282.6000061035156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.416879892349243, "epoch": 0.8545578695800614, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5702353345273876e-07, "loss": 0.0, "num_tokens": 1913835964.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 2014.994140625, "completions/mean_terminated_length": 170.3333282470703, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.445734143257141, "epoch": 0.8548992830317514, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.567625126987686e-07, "loss": 0.0, "num_tokens": 1914936985.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1046.0, "completions/mean_length": 2031.49609375, "completions/mean_terminated_length": 358.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.458443522453308, "epoch": 0.8552406964834415, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5650205049703417e-07, "loss": 0.0, "num_tokens": 1916046967.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 2033.01171875, "completions/mean_terminated_length": 129.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.40440821647644, "epoch": 0.8555821099351314, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5624214721749454e-07, "loss": 0.0, "num_tokens": 1917165485.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 2031.203125, "completions/mean_terminated_length": 328.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.451766848564148, "epoch": 0.8559235233868214, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.559828032293147e-07, "loss": 0.0, "num_tokens": 1918284597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1026.0, "completions/mean_length": 2029.333984375, "completions/mean_terminated_length": 455.16668701171875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.402328372001648, "epoch": 0.8562649368385115, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5572401890086537e-07, "loss": 0.0, "num_tokens": 1919401488.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 2037.015625, "completions/mean_terminated_length": 173.33334350585938, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 7.436187148094177, "epoch": 0.8566063502902014, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5546579459972237e-07, "loss": 0.0, "num_tokens": 1920521832.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2040.23046875, "completions/mean_terminated_length": 1827.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 7.188106298446655, "epoch": 0.8569477637418914, "frac_reward_zero_std": 0.96875, "grad_norm": 0.9872857572459777, "learning_rate": 1.5520813069266605e-07, "loss": -0.0, "num_tokens": 1921649886.0, "reward": 0.00048828125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00048828125, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 2033.0, "completions/mean_terminated_length": 128.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.384153485298157, "epoch": 0.8572891771935814, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.549510275456805e-07, "loss": 0.0, "num_tokens": 1922779422.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 2038.228515625, "completions/mean_terminated_length": 380.3333435058594, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 7.481388688087463, "epoch": 0.8576305906452715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5469448552395384e-07, "loss": 0.0, "num_tokens": 1923896211.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 2033.154296875, "completions/mean_terminated_length": 527.7999877929688, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.439296722412109, "epoch": 0.8579720040969614, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5443850499187656e-07, "loss": 0.0, "num_tokens": 1925017858.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1063.0, "completions/mean_length": 2037.189453125, "completions/mean_terminated_length": 664.25, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.4586169719696045, "epoch": 0.8583134175486514, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.54183086313042e-07, "loss": 0.0, "num_tokens": 1926142243.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1035.0, "completions/mean_length": 2036.03515625, "completions/mean_terminated_length": 516.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 7.476733326911926, "epoch": 0.8586548310003415, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.539282298502454e-07, "loss": 0.0, "num_tokens": 1927260885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 2040.5625, "completions/mean_terminated_length": 778.6666870117188, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 7.444947957992554, "epoch": 0.8589962444520314, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5367393596548355e-07, "loss": 0.0, "num_tokens": 1928384261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1773.0, "completions/mean_length": 2037.423828125, "completions/mean_terminated_length": 694.25, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 7.468030571937561, "epoch": 0.8593376579037214, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5342020501995375e-07, "loss": 0.0, "num_tokens": 1929502846.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1122.0, "completions/mean_length": 2028.43359375, "completions/mean_terminated_length": 378.3333435058594, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 7.42153799533844, "epoch": 0.8596790713554114, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5316703737405416e-07, "loss": 0.0, "num_tokens": 1930621708.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2035.435546875, "completions/mean_terminated_length": 1726.3499755859375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.430118680000305, "epoch": 0.8600204848071014, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5291443338738242e-07, "loss": 0.0, "num_tokens": 1931747707.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 2034.474609375, "completions/mean_terminated_length": 316.75, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 7.4723639488220215, "epoch": 0.8603618982587914, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.526623934187359e-07, "loss": 0.0, "num_tokens": 1932867982.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1784.0, "completions/mean_length": 2037.90625, "completions/mean_terminated_length": 1014.4000244140625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 7.445160627365112, "epoch": 0.8607033117104814, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.524109178261106e-07, "loss": 0.0, "num_tokens": 1933991854.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 1996.65625, "completions/mean_terminated_length": 587.5555419921875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.3825763463974, "epoch": 0.8610447251621713, "frac_reward_zero_std": 0.96875, "grad_norm": 0.024076790610000304, "learning_rate": 1.521600069667012e-07, "loss": 0.001, "num_tokens": 1935095470.0, "reward": 0.033203125, "reward_std": 0.016010859981179237, "rewards/accuracy_reward/mean": 0.017578125, "rewards/accuracy_reward/std": 0.13154059648513794, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1086.0, "completions/mean_length": 2022.39453125, "completions/mean_terminated_length": 409.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 7.4264174699783325, "epoch": 0.8613861386138614, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5190966119689977e-07, "loss": 0.0, "num_tokens": 1936207480.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 2044.52734375, "completions/mean_terminated_length": 270.0, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "entropy": 7.463778257369995, "epoch": 0.8617275520655514, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.516598808722962e-07, "loss": 0.0, "num_tokens": 1937334374.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 2032.310546875, "completions/mean_terminated_length": 441.3999938964844, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.424929857254028, "epoch": 0.8620689655172413, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.514106663476768e-07, "loss": 0.0, "num_tokens": 1938448069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1998.3203125, "completions/mean_terminated_length": 836.7619018554688, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 7.347361445426941, "epoch": 0.8624103789689314, "frac_reward_zero_std": 0.96875, "grad_norm": 0.3488257787205506, "learning_rate": 1.5116201797702455e-07, "loss": 0.008, "num_tokens": 1939551497.0, "reward": 0.0146484375, "reward_std": 0.002668476663529873, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0146484375, "rewards/tag_count_reward/std": 0.08293935656547546, "step": 2526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1092.0, "completions/mean_length": 2035.9296875, "completions/mean_terminated_length": 503.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.456102728843689, "epoch": 0.8627517924206214, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5091393611351817e-07, "loss": 0.0, "num_tokens": 1940674965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 2033.16796875, "completions/mean_terminated_length": 529.2000122070312, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 7.407387971878052, "epoch": 0.8630932058723114, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5066642110953168e-07, "loss": 0.0, "num_tokens": 1941811659.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1183.0, "completions/mean_length": 2021.564453125, "completions/mean_terminated_length": 544.1111450195312, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 7.476672291755676, "epoch": 0.8634346193240013, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5041947331663385e-07, "loss": 0.0, "num_tokens": 1942922876.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 2030.427734375, "completions/mean_terminated_length": 248.60000610351562, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.5203739404678345, "epoch": 0.8637760327756914, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5017309308558804e-07, "loss": 0.0, "num_tokens": 1944034279.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 2037.296875, "completions/mean_terminated_length": 221.33334350585938, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 7.453924775123596, "epoch": 0.8641174462273814, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.499272807663511e-07, "loss": 0.0, "num_tokens": 1945152703.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 2028.794921875, "completions/mean_terminated_length": 409.16668701171875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 7.425845384597778, "epoch": 0.8644588596790713, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4968203670807367e-07, "loss": 0.0, "num_tokens": 1946272438.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 2032.361328125, "completions/mean_terminated_length": 46.25, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.429221510887146, "epoch": 0.8648002731307614, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4943736125909862e-07, "loss": 0.0, "num_tokens": 1947385247.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 2042.21875, "completions/mean_terminated_length": 1061.3333740234375, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "entropy": 7.44893205165863, "epoch": 0.8651416865824514, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4919325476696197e-07, "loss": 0.0, "num_tokens": 1948509231.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1220.0, "completions/mean_length": 2031.5234375, "completions/mean_terminated_length": 360.8000183105469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.470082998275757, "epoch": 0.8654831000341413, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4894971757839084e-07, "loss": 0.0, "num_tokens": 1949636251.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 2042.3515625, "completions/mean_terminated_length": 602.0, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "entropy": 7.510910272598267, "epoch": 0.8658245134858313, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.487067500393041e-07, "loss": 0.0, "num_tokens": 1950759439.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1353.0, "completions/mean_length": 2033.580078125, "completions/mean_terminated_length": 571.4000244140625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 7.4548022747039795, "epoch": 0.8661659269375214, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4846435249481159e-07, "loss": 0.0, "num_tokens": 1951880696.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 2033.578125, "completions/mean_terminated_length": 817.3333740234375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 7.471261501312256, "epoch": 0.8665073403892113, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4822252528921314e-07, "loss": 0.0, "num_tokens": 1953001248.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 2038.33203125, "completions/mean_terminated_length": 398.0, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 7.469312310218811, "epoch": 0.8668487538409013, "frac_reward_zero_std": 0.96875, "grad_norm": 0.1364049745414366, "learning_rate": 1.479812687659988e-07, "loss": 0.0, "num_tokens": 1954123082.0, "reward": 0.00048828125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00048828125, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 2020.158203125, "completions/mean_terminated_length": 266.125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 7.477178335189819, "epoch": 0.8671901672925914, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4774058326784793e-07, "loss": 0.0, "num_tokens": 1955232475.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 2029.931640625, "completions/mean_terminated_length": 197.8000030517578, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.425842523574829, "epoch": 0.8675315807442813, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4750046913662893e-07, "loss": 0.0, "num_tokens": 1956353608.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.923828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1961.666015625, "completions/mean_terminated_length": 914.5897827148438, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.154355525970459, "epoch": 0.8678729941959713, "frac_reward_zero_std": 0.96875, "grad_norm": 0.6606900089277535, "learning_rate": 1.472609267133983e-07, "loss": 0.0052, "num_tokens": 1957446413.0, "reward": 0.02880859375, "reward_std": 0.003739949781447649, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.02880859375, "rewards/tag_count_reward/std": 0.11397096514701843, "step": 2542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 2020.9453125, "completions/mean_terminated_length": 316.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.419232368469238, "epoch": 0.8682144076476613, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4702195633840086e-07, "loss": 0.0, "num_tokens": 1958568593.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 2041.4375, "completions/mean_terminated_length": 368.0, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 7.4704577922821045, "epoch": 0.8685558210993514, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4678355835106862e-07, "loss": 0.0, "num_tokens": 1959695777.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 2035.166015625, "completions/mean_terminated_length": 405.25, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.484990119934082, "epoch": 0.8688972345510413, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4654573309002081e-07, "loss": 0.0, "num_tokens": 1960808902.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 2034.029296875, "completions/mean_terminated_length": 259.75, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 7.48026168346405, "epoch": 0.8692386480027313, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4630848089306282e-07, "loss": 0.0, "num_tokens": 1961936901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1087.0, "completions/mean_length": 2032.369140625, "completions/mean_terminated_length": 447.3999938964844, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.480897665023804, "epoch": 0.8695800614544213, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4607180209718666e-07, "loss": 0.0, "num_tokens": 1963057698.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 2036.701171875, "completions/mean_terminated_length": 119.66667175292969, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.459155797958374, "epoch": 0.8699214749061113, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.458356970385692e-07, "loss": 0.0, "num_tokens": 1964179193.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1686.0, "completions/mean_length": 2028.16796875, "completions/mean_terminated_length": 778.75, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 7.441940426826477, "epoch": 0.8702628883578013, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4560016605257285e-07, "loss": 0.0, "num_tokens": 1965301327.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 109.0, "completions/mean_length": 2040.296875, "completions/mean_terminated_length": 76.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 7.5817482471466064, "epoch": 0.8706043018094913, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4536520947374438e-07, "loss": 0.0, "num_tokens": 1966425943.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 2020.181640625, "completions/mean_terminated_length": 267.625, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.423120498657227, "epoch": 0.8709457152611813, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4513082763581479e-07, "loss": 0.0, "num_tokens": 1967524788.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1462.0, "completions/mean_length": 2004.33984375, "completions/mean_terminated_length": 806.1111450195312, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.336677074432373, "epoch": 0.8712871287128713, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4489702087169846e-07, "loss": 0.0, "num_tokens": 1968634226.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1050.0, "completions/mean_length": 2024.59375, "completions/mean_terminated_length": 336.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.438772678375244, "epoch": 0.8716285421645613, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4466378951349356e-07, "loss": 0.0, "num_tokens": 1969748610.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 2045.115234375, "completions/mean_terminated_length": 571.0, "completions/min_length": 571.0, "completions/min_terminated_length": 571.0, "entropy": 7.436383247375488, "epoch": 0.8719699556162512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4443113389248027e-07, "loss": 0.0, "num_tokens": 1970879389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1124.0, "completions/mean_length": 2030.89453125, "completions/mean_terminated_length": 296.3999938964844, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.446506023406982, "epoch": 0.8723113690679413, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4419905433912138e-07, "loss": 0.0, "num_tokens": 1972001495.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1125.0, "completions/mean_length": 2040.861328125, "completions/mean_terminated_length": 829.6666870117188, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "entropy": 7.482445597648621, "epoch": 0.8726527825196313, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.439675511830612e-07, "loss": 0.0, "num_tokens": 1973132736.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1038.0, "completions/mean_length": 2031.6640625, "completions/mean_terminated_length": 654.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 7.447680473327637, "epoch": 0.8729941959713212, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4373662475312574e-07, "loss": 0.0, "num_tokens": 1974246532.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.955078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 1990.6953125, "completions/mean_terminated_length": 772.3478393554688, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 7.351280450820923, "epoch": 0.8733356094230112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4350627537732113e-07, "loss": 0.0, "num_tokens": 1975342968.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 2036.587890625, "completions/mean_terminated_length": 100.33333587646484, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.437984347343445, "epoch": 0.8736770228747013, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.432765033828347e-07, "loss": 0.0, "num_tokens": 1976462821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 2024.658203125, "completions/mean_terminated_length": 340.71429443359375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 7.418626427650452, "epoch": 0.8740184363263913, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4304730909603305e-07, "loss": 0.0, "num_tokens": 1977580646.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 2031.974609375, "completions/mean_terminated_length": 680.5, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 7.493143439292908, "epoch": 0.8743598497780812, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.428186928424625e-07, "loss": 0.0, "num_tokens": 1978693241.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 2041.052734375, "completions/mean_terminated_length": 269.5, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "entropy": 7.490296363830566, "epoch": 0.8747012632297713, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.425906549468481e-07, "loss": 0.0, "num_tokens": 1979816836.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1218.0, "completions/mean_length": 2028.18359375, "completions/mean_terminated_length": 357.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.360341787338257, "epoch": 0.8750426766814613, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4236319573309374e-07, "loss": 0.0, "num_tokens": 1980933970.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2004.271484375, "completions/mean_terminated_length": 981.857177734375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 7.329301238059998, "epoch": 0.8753840901331512, "frac_reward_zero_std": 0.96875, "grad_norm": 1.3646533455797767, "learning_rate": 1.421363155242809e-07, "loss": 0.0101, "num_tokens": 1982050989.0, "reward": 0.02783203125, "reward_std": 0.019757801666855812, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12414088100194931, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01220703125, "rewards/tag_count_reward/std": 0.07148420810699463, "step": 2564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 2033.7734375, "completions/mean_terminated_length": 227.0, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 7.452273607254028, "epoch": 0.8757255035848412, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4191001464266915e-07, "loss": 0.0, "num_tokens": 1983168649.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1128.0, "completions/mean_length": 2016.3046875, "completions/mean_terminated_length": 425.20001220703125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.450464367866516, "epoch": 0.8760669170365313, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4168429340969485e-07, "loss": 0.0, "num_tokens": 1984281573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 2031.361328125, "completions/mean_terminated_length": 344.20001220703125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 7.49022102355957, "epoch": 0.8764083304882212, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4145915214597114e-07, "loss": 0.0, "num_tokens": 1985400510.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1612.0, "completions/mean_length": 2026.376953125, "completions/mean_terminated_length": 466.4285888671875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.415441989898682, "epoch": 0.8767497439399112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4123459117128738e-07, "loss": 0.0, "num_tokens": 1986510447.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 2032.5234375, "completions/mean_terminated_length": 67.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.429630637168884, "epoch": 0.8770911573916013, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4101061080460862e-07, "loss": 0.0, "num_tokens": 1987622091.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 2037.95703125, "completions/mean_terminated_length": 334.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.418896198272705, "epoch": 0.8774325708432912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4078721136407525e-07, "loss": 0.0, "num_tokens": 1988749189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 2027.359375, "completions/mean_terminated_length": 286.66668701171875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.439982175827026, "epoch": 0.8777739842949812, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4056439316700256e-07, "loss": 0.0, "num_tokens": 1989856477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2034.21484375, "completions/mean_terminated_length": 1711.90478515625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.18589174747467, "epoch": 0.8781153977466712, "frac_reward_zero_std": 0.96875, "grad_norm": 0.14481342565517172, "learning_rate": 1.4034215652988026e-07, "loss": 0.0003, "num_tokens": 1990982315.0, "reward": 0.0087890625, "reward_std": 0.002668476663529873, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0087890625, "rewards/tag_count_reward/std": 0.051121458411216736, "step": 2572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 2030.30859375, "completions/mean_terminated_length": 236.40000915527344, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 7.425808668136597, "epoch": 0.8784568111983612, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4012050176837205e-07, "loss": 0.0, "num_tokens": 1992098889.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1314.0, "completions/mean_length": 1993.509765625, "completions/mean_terminated_length": 719.4761962890625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.362883806228638, "epoch": 0.8787982246500512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3989942919731484e-07, "loss": 0.0, "num_tokens": 1993199550.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1356.0, "completions/mean_length": 2025.90625, "completions/mean_terminated_length": 432.0000305175781, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 7.508161425590515, "epoch": 0.8791396381017412, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3967893913071898e-07, "loss": 0.0, "num_tokens": 1994314046.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 2028.62890625, "completions/mean_terminated_length": 631.1428833007812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 7.436893582344055, "epoch": 0.8794810515534313, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3945903188176719e-07, "loss": 0.0, "num_tokens": 1995426400.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 2038.59765625, "completions/mean_terminated_length": 443.3333435058594, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 7.460209369659424, "epoch": 0.8798224650051212, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3923970776281452e-07, "loss": 0.0, "num_tokens": 1996546178.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 1989.888671875, "completions/mean_terminated_length": 560.3500366210938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.378965497016907, "epoch": 0.8801638784568112, "frac_reward_zero_std": 0.96875, "grad_norm": 0.05467025816587016, "learning_rate": 1.3902096708538762e-07, "loss": -0.0018, "num_tokens": 1997655529.0, "reward": 0.021484375, "reward_std": 0.012597277760505676, "rewards/accuracy_reward/mean": 0.005859375, "rewards/accuracy_reward/std": 0.07639661431312561, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 2023.56640625, "completions/mean_terminated_length": 260.8571472167969, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.512592554092407, "epoch": 0.8805052919085012, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3880281016018455e-07, "loss": 0.0, "num_tokens": 1998768539.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 2044.0546875, "completions/mean_terminated_length": 28.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 7.488916039466858, "epoch": 0.8808467053601912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3858523729707402e-07, "loss": 0.0, "num_tokens": 1999885863.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 2005.7578125, "completions/mean_terminated_length": 966.6000366210938, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.264840126037598, "epoch": 0.8811881188118812, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3836824880509543e-07, "loss": 0.0, "num_tokens": 2000998859.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 2036.33984375, "completions/mean_terminated_length": 854.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 7.469657778739929, "epoch": 0.8815295322635712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3815184499245775e-07, "loss": 0.0, "num_tokens": 2002124233.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 1980.2890625, "completions/mean_terminated_length": 397.1428527832031, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.409211158752441, "epoch": 0.8818709457152611, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3793602616653977e-07, "loss": 0.0, "num_tokens": 2003220813.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 2035.6015625, "completions/mean_terminated_length": 461.0, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 7.469872236251831, "epoch": 0.8822123591669512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.377207926338894e-07, "loss": 0.0, "num_tokens": 2004339521.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 2028.931640625, "completions/mean_terminated_length": 420.8333435058594, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 7.500434875488281, "epoch": 0.8825537726186412, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.37506144700223e-07, "loss": 0.0, "num_tokens": 2005449582.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1076.0, "completions/mean_length": 2028.845703125, "completions/mean_terminated_length": 413.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.460627794265747, "epoch": 0.8828951860703311, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3729208267042524e-07, "loss": 0.0, "num_tokens": 2006562159.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 2028.7890625, "completions/mean_terminated_length": 642.857177734375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.442204236984253, "epoch": 0.8832365995220212, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3707860684854872e-07, "loss": 0.0, "num_tokens": 2007677459.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 2032.736328125, "completions/mean_terminated_length": 485.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.530783176422119, "epoch": 0.8835780129737112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3686571753781302e-07, "loss": 0.0, "num_tokens": 2008792332.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 2027.3359375, "completions/mean_terminated_length": 284.66668701171875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.437960624694824, "epoch": 0.8839194264254011, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.36653415040605e-07, "loss": 0.0, "num_tokens": 2009913496.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 1991.63671875, "completions/mean_terminated_length": 605.1000366210938, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.376985311508179, "epoch": 0.8842608398770911, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3644169965847787e-07, "loss": 0.0, "num_tokens": 2011014590.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1136.0, "completions/mean_length": 2038.224609375, "completions/mean_terminated_length": 379.66668701171875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.4448747634887695, "epoch": 0.8846022533287812, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3623057169215102e-07, "loss": 0.0, "num_tokens": 2012127633.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 2036.67578125, "completions/mean_terminated_length": 115.33333587646484, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.472687721252441, "epoch": 0.8849436667804712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3602003144150926e-07, "loss": 0.0, "num_tokens": 2013239691.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1041.0, "completions/mean_length": 2032.93359375, "completions/mean_terminated_length": 505.20001220703125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 7.3965723514556885, "epoch": 0.8852850802321611, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3581007920560281e-07, "loss": 0.0, "num_tokens": 2014356377.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 2032.849609375, "completions/mean_terminated_length": 496.6000061035156, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 7.419658184051514, "epoch": 0.8856264936838512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3560071528264653e-07, "loss": 0.0, "num_tokens": 2015470764.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 2040.830078125, "completions/mean_terminated_length": 212.5, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 7.442207098007202, "epoch": 0.8859679071355412, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3539193997001976e-07, "loss": 0.0, "num_tokens": 2016584821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1088.0, "completions/mean_length": 2040.25390625, "completions/mean_terminated_length": 726.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 7.451852202415466, "epoch": 0.8863093205872311, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.351837535642657e-07, "loss": 0.0, "num_tokens": 2017702055.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 2037.888671875, "completions/mean_terminated_length": 322.3333435058594, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 7.396289706230164, "epoch": 0.8866507340389211, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3497615636109124e-07, "loss": 0.0, "num_tokens": 2018822126.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 2033.8671875, "completions/mean_terminated_length": 239.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.447802543640137, "epoch": 0.8869921474906112, "frac_reward_zero_std": 0.96875, "grad_norm": 0.2290131000391159, "learning_rate": 1.3476914865536608e-07, "loss": 0.0, "num_tokens": 2019938186.0, "reward": 0.00048828125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00048828125, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 2039.0, "completions/mean_terminated_length": 512.0, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 7.419032096862793, "epoch": 0.8873335609423011, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3456273074112287e-07, "loss": 0.0, "num_tokens": 2021060138.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1174.0, "completions/mean_length": 2036.89453125, "completions/mean_terminated_length": 626.5, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 7.365715861320496, "epoch": 0.8876749743939911, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3435690291155627e-07, "loss": 0.0, "num_tokens": 2022176484.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 2044.072265625, "completions/mean_terminated_length": 37.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 7.440275430679321, "epoch": 0.8880163878456812, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.341516654590231e-07, "loss": 0.0, "num_tokens": 2023296489.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1871.0, "completions/mean_length": 2026.646484375, "completions/mean_terminated_length": 486.14288330078125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.419044733047485, "epoch": 0.8883578012973711, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.339470186750413e-07, "loss": 0.0, "num_tokens": 2024409172.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 2043.380859375, "completions/mean_terminated_length": 865.5, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "entropy": 7.352962017059326, "epoch": 0.8886992147490611, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3374296285029014e-07, "loss": 0.0, "num_tokens": 2025530007.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 2045.6015625, "completions/mean_terminated_length": 820.0, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "entropy": 7.457139372825623, "epoch": 0.8890406282007511, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.335394982746091e-07, "loss": 0.0, "num_tokens": 2026649483.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1300.0, "completions/mean_length": 2040.439453125, "completions/mean_terminated_length": 757.6666870117188, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 7.457604646682739, "epoch": 0.8893820416524411, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.333366252369983e-07, "loss": 0.0, "num_tokens": 2027769132.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1171.0, "completions/mean_length": 2040.171875, "completions/mean_terminated_length": 712.0, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 7.384695529937744, "epoch": 0.8897234551041311, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3313434402561725e-07, "loss": 0.0, "num_tokens": 2028885908.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1085.0, "completions/mean_length": 2036.771484375, "completions/mean_terminated_length": 610.75, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 7.398870348930359, "epoch": 0.8900648685558211, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3293265492778502e-07, "loss": 0.0, "num_tokens": 2030002783.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 2040.638671875, "completions/mean_terminated_length": 163.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 7.312588334083557, "epoch": 0.890406282007511, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3273155822997975e-07, "loss": 0.0, "num_tokens": 2031124854.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 2044.923828125, "completions/mean_terminated_length": 473.0, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "entropy": 7.393906950950623, "epoch": 0.8907476954592011, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3253105421783794e-07, "loss": 0.0, "num_tokens": 2032245439.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1207.0, "completions/mean_length": 2038.64453125, "completions/mean_terminated_length": 451.3333435058594, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 7.3939385414123535, "epoch": 0.8910891089108911, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3233114317615436e-07, "loss": 0.0, "num_tokens": 2033362953.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1373.0, "completions/mean_length": 2028.505859375, "completions/mean_terminated_length": 384.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.30047070980072, "epoch": 0.8914305223625811, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3213182538888146e-07, "loss": 0.0, "num_tokens": 2034484236.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1062.0, "completions/mean_length": 2038.529296875, "completions/mean_terminated_length": 431.66668701171875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.390876054763794, "epoch": 0.891771935814271, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.31933101139129e-07, "loss": 0.0, "num_tokens": 2035600507.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 2044.03515625, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 7.300321578979492, "epoch": 0.8921133492659611, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.317349707091638e-07, "loss": 0.0, "num_tokens": 2036730605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 2038.728515625, "completions/mean_terminated_length": 465.66668701171875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 7.34476363658905, "epoch": 0.8924547627176511, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.315374343804092e-07, "loss": 0.0, "num_tokens": 2037850114.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1138.0, "completions/mean_length": 2032.01171875, "completions/mean_terminated_length": 410.8000183105469, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 7.304595232009888, "epoch": 0.892796176169341, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.313404924334447e-07, "loss": 0.0, "num_tokens": 2038966744.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1855.0, "completions/mean_length": 2036.216796875, "completions/mean_terminated_length": 539.75, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.315793037414551, "epoch": 0.8931375896210311, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3114414514800532e-07, "loss": 0.0, "num_tokens": 2040092391.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 2034.21875, "completions/mean_terminated_length": 284.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 7.269364356994629, "epoch": 0.8934790030727211, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3094839280298182e-07, "loss": 0.0, "num_tokens": 2041208743.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1907.0, "completions/mean_length": 2032.263671875, "completions/mean_terminated_length": 436.6000061035156, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.2764153480529785, "epoch": 0.893820416524411, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3075323567641945e-07, "loss": 0.0, "num_tokens": 2042329054.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1769.0, "completions/mean_length": 2033.03515625, "completions/mean_terminated_length": 771.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 7.253867268562317, "epoch": 0.894161829976101, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.305586740455184e-07, "loss": 0.0, "num_tokens": 2043451584.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2039.26171875, "completions/mean_terminated_length": 1799.4444580078125, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "entropy": 7.074024438858032, "epoch": 0.8945032434277911, "frac_reward_zero_std": 0.96875, "grad_norm": 0.46035569310634966, "learning_rate": 1.3036470818663282e-07, "loss": 0.0004, "num_tokens": 2044579030.0, "reward": 0.0146484375, "reward_std": 0.013033847324550152, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.06243881583213806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0107421875, "rewards/tag_count_reward/std": 0.06358373910188675, "step": 2620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 2039.23046875, "completions/mean_terminated_length": 551.3333740234375, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "entropy": 7.280572295188904, "epoch": 0.894844656879481, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3017133837527082e-07, "loss": 0.0, "num_tokens": 2045702828.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 7.33203125, "epoch": 0.895186070331171, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.299785648860936e-07, "loss": 0.0, "num_tokens": 2046835500.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 2032.232421875, "completions/mean_terminated_length": 29.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.3431172370910645, "epoch": 0.8955274837828611, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2978638799291557e-07, "loss": 0.0, "num_tokens": 2047952131.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 2037.10546875, "completions/mean_terminated_length": 653.5, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 7.2538957595825195, "epoch": 0.895868897234551, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2959480796870362e-07, "loss": 0.0, "num_tokens": 2049069273.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1129.0, "completions/mean_length": 2032.265625, "completions/mean_terminated_length": 436.8000183105469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.3531869649887085, "epoch": 0.896210310686241, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2940382508557692e-07, "loss": 0.0, "num_tokens": 2050186753.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1341.0, "completions/mean_length": 2033.564453125, "completions/mean_terminated_length": 569.7999877929688, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 7.272175192832947, "epoch": 0.896551724137931, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.292134396148065e-07, "loss": 0.0, "num_tokens": 2051311730.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1158.0, "completions/mean_length": 2020.986328125, "completions/mean_terminated_length": 319.125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.270734190940857, "epoch": 0.8968931375896211, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2902365182681476e-07, "loss": 0.0, "num_tokens": 2052421339.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1085.0, "completions/mean_length": 2036.755859375, "completions/mean_terminated_length": 608.75, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 7.3394694328308105, "epoch": 0.897234551041311, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2883446199117506e-07, "loss": 0.0, "num_tokens": 2053537726.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 2024.916015625, "completions/mean_terminated_length": 359.5714416503906, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.406259298324585, "epoch": 0.897575964493001, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.286458703766117e-07, "loss": 0.0, "num_tokens": 2054645555.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1296.0, "completions/mean_length": 2027.5546875, "completions/mean_terminated_length": 552.5714721679688, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.259608626365662, "epoch": 0.8979173779446911, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2845787725099897e-07, "loss": 0.0, "num_tokens": 2055767423.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1112.0, "completions/mean_length": 2035.501953125, "completions/mean_terminated_length": 448.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.357897877693176, "epoch": 0.898258791396381, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2827048288136126e-07, "loss": 0.0, "num_tokens": 2056884992.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 2027.80859375, "completions/mean_terminated_length": 325.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.273398399353027, "epoch": 0.898600204848071, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2808368753387248e-07, "loss": 0.0, "num_tokens": 2058002494.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1126.0, "completions/mean_length": 2038.34375, "completions/mean_terminated_length": 400.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 7.301588654518127, "epoch": 0.898941618299761, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2789749147385562e-07, "loss": 0.0, "num_tokens": 2059124446.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 2036.869140625, "completions/mean_terminated_length": 148.33334350585938, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.3318222761154175, "epoch": 0.899283031751451, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2771189496578248e-07, "loss": 0.0, "num_tokens": 2060242523.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 2034.056640625, "completions/mean_terminated_length": 263.25, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 7.353140592575073, "epoch": 0.899624445203141, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.275268982732733e-07, "loss": 0.0, "num_tokens": 2061358072.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 2026.314453125, "completions/mean_terminated_length": 461.857177734375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.299442172050476, "epoch": 0.899965858654831, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2734250165909624e-07, "loss": 0.0, "num_tokens": 2062478281.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 2019.166015625, "completions/mean_terminated_length": 202.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.351015329360962, "epoch": 0.900307272106521, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2715870538516713e-07, "loss": 0.0, "num_tokens": 2063593662.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1287.0, "completions/mean_length": 2035.17578125, "completions/mean_terminated_length": 406.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.272529721260071, "epoch": 0.900648685558211, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.269755097125492e-07, "loss": 0.0, "num_tokens": 2064714440.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 2044.76171875, "completions/mean_terminated_length": 390.0, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "entropy": 7.309353590011597, "epoch": 0.900990099009901, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2679291490145267e-07, "loss": 0.0, "num_tokens": 2065842894.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1981.142578125, "completions/mean_terminated_length": 1097.138916015625, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 6.998336672782898, "epoch": 0.9013315124615909, "frac_reward_zero_std": 0.96875, "grad_norm": 0.03571450601509734, "learning_rate": 1.2661092121123387e-07, "loss": 0.0013, "num_tokens": 2066939527.0, "reward": 0.0283203125, "reward_std": 0.013342384248971939, "rewards/accuracy_reward/mean": 0.004032257944345474, "rewards/accuracy_reward/std": 0.06343589723110199, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0244140625, "rewards/tag_count_reward/std": 0.09960611909627914, "step": 2640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 2033.701171875, "completions/mean_terminated_length": 217.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 7.315815806388855, "epoch": 0.901672925913281, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2642952890039577e-07, "loss": 0.0, "num_tokens": 2068056014.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1851.0, "completions/mean_length": 2032.71875, "completions/mean_terminated_length": 744.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.302455425262451, "epoch": 0.902014339364971, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.262487382265868e-07, "loss": 0.0, "num_tokens": 2069175550.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1742.0, "completions/mean_length": 1995.23046875, "completions/mean_terminated_length": 761.4285888671875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.12518584728241, "epoch": 0.902355752816661, "frac_reward_zero_std": 0.96875, "grad_norm": 0.05215056779041346, "learning_rate": 1.2606854944660113e-07, "loss": 0.0018, "num_tokens": 2070281332.0, "reward": 0.03662109375, "reward_std": 0.014745770022273064, "rewards/accuracy_reward/mean": 0.02217741869390011, "rewards/accuracy_reward/std": 0.14740893244743347, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 2643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 2038.36328125, "completions/mean_terminated_length": 403.3333435058594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.272305607795715, "epoch": 0.902697166268351, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2588896281637765e-07, "loss": 0.0, "num_tokens": 2071405950.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 2038.40234375, "completions/mean_terminated_length": 410.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.2804683446884155, "epoch": 0.903038579720041, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2570997859100044e-07, "loss": 0.0, "num_tokens": 2072523740.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1115.0, "completions/mean_length": 1995.34375, "completions/mean_terminated_length": 550.2222290039062, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 7.247391581535339, "epoch": 0.903379993171731, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2553159702469743e-07, "loss": 0.0, "num_tokens": 2073628732.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 2033.302734375, "completions/mean_terminated_length": 166.75, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 7.312941908836365, "epoch": 0.9037214066234209, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2535381837084092e-07, "loss": 0.0, "num_tokens": 2074746295.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2037.662109375, "completions/mean_terminated_length": 1769.4210205078125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.158963322639465, "epoch": 0.904062820075111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.251766428819465e-07, "loss": 0.0, "num_tokens": 2075871754.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 2026.947265625, "completions/mean_terminated_length": 508.14288330078125, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.3087111711502075, "epoch": 0.904404233526801, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2500007080967335e-07, "loss": 0.0, "num_tokens": 2076991695.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 2033.873046875, "completions/mean_terminated_length": 239.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 7.26897931098938, "epoch": 0.9047456469784909, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2482410240482323e-07, "loss": 0.0, "num_tokens": 2078106046.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 2034.36328125, "completions/mean_terminated_length": 884.3333740234375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 7.285163164138794, "epoch": 0.905087060430181, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2464873791734088e-07, "loss": 0.0, "num_tokens": 2079219032.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 2030.517578125, "completions/mean_terminated_length": 556.1666870117188, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.3218430280685425, "epoch": 0.905428473881871, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.244739775963128e-07, "loss": 0.0, "num_tokens": 2080336225.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1836.0, "completions/mean_length": 2038.3203125, "completions/mean_terminated_length": 809.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 7.319874048233032, "epoch": 0.9057698873335609, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.242998216899677e-07, "loss": 0.0, "num_tokens": 2081456549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 2029.189453125, "completions/mean_terminated_length": 121.80000305175781, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.330453634262085, "epoch": 0.9061113007852509, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2412627044567542e-07, "loss": 0.0, "num_tokens": 2082562646.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1990.734375, "completions/mean_terminated_length": 582.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.219396471977234, "epoch": 0.906452714236941, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2395332410994732e-07, "loss": 0.0, "num_tokens": 2083662478.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 2040.521484375, "completions/mean_terminated_length": 133.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 7.239410877227783, "epoch": 0.9067941276886309, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.237809829284352e-07, "loss": 0.0, "num_tokens": 2084775081.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1740.0, "completions/mean_length": 2000.646484375, "completions/mean_terminated_length": 835.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 7.163971543312073, "epoch": 0.9071355411403209, "frac_reward_zero_std": 0.96875, "grad_norm": 0.02999753305026519, "learning_rate": 1.2360924714593165e-07, "loss": 0.0024, "num_tokens": 2085878644.0, "reward": 0.01513671875, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 2657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1065.0, "completions/mean_length": 2036.23046875, "completions/mean_terminated_length": 541.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 7.305761337280273, "epoch": 0.9074769545920109, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2343811700636902e-07, "loss": 0.0, "num_tokens": 2086997674.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 2024.71484375, "completions/mean_terminated_length": 344.8571472167969, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.261038422584534, "epoch": 0.907818368043701, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2326759275281966e-07, "loss": 0.0, "num_tokens": 2088110808.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1745.0, "completions/mean_length": 2040.357421875, "completions/mean_terminated_length": 743.6666870117188, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 7.3133180141448975, "epoch": 0.9081597814953909, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2309767462749515e-07, "loss": 0.0, "num_tokens": 2089227071.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1.0, "completions/mean_length": 2044.001953125, "completions/mean_terminated_length": 1.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.341203689575195, "epoch": 0.9085011949470809, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2292836287174631e-07, "loss": 0.0, "num_tokens": 2090346192.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.951171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1229.0, "completions/mean_length": 1973.271484375, "completions/mean_terminated_length": 517.5599975585938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.207287669181824, "epoch": 0.908842608398771, "frac_reward_zero_std": 0.96875, "grad_norm": 0.021058936673999485, "learning_rate": 1.2275965772606242e-07, "loss": 0.0024, "num_tokens": 2091435259.0, "reward": 0.01513671875, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 2662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 2026.587890625, "completions/mean_terminated_length": 220.83334350585938, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.299233913421631, "epoch": 0.9091840218504609, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2259155943007143e-07, "loss": 0.0, "num_tokens": 2092552088.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 2040.0859375, "completions/mean_terminated_length": 697.3333740234375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 7.331140398979187, "epoch": 0.9095254353021509, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2242406822253908e-07, "loss": 0.0, "num_tokens": 2093668676.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 2024.5078125, "completions/mean_terminated_length": 544.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 7.331657409667969, "epoch": 0.9098668487538409, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.22257184341369e-07, "loss": 0.0, "num_tokens": 2094789064.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 2016.76953125, "completions/mean_terminated_length": 449.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.258445143699646, "epoch": 0.9102082622055309, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2209090802360198e-07, "loss": 0.0, "num_tokens": 2095896162.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1035.0, "completions/mean_length": 2026.453125, "completions/mean_terminated_length": 209.33334350585938, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.2946577072143555, "epoch": 0.9105496756572209, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.21925239505416e-07, "loss": 0.0, "num_tokens": 2097006186.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 2038.927734375, "completions/mean_terminated_length": 499.66668701171875, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.235970616340637, "epoch": 0.9108910891089109, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.217601790221257e-07, "loss": 0.0, "num_tokens": 2098132805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 2040.189453125, "completions/mean_terminated_length": 48.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 7.279875040054321, "epoch": 0.9112325025606008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2159572680818183e-07, "loss": 0.0, "num_tokens": 2099267142.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1997.654296875, "completions/mean_terminated_length": 691.3157958984375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.257268667221069, "epoch": 0.9115739160122909, "frac_reward_zero_std": 0.96875, "grad_norm": 0.4235336527563951, "learning_rate": 1.214318830971716e-07, "loss": 0.006, "num_tokens": 2100368645.0, "reward": 0.01513671875, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 2670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 113.0, "completions/mean_length": 2044.220703125, "completions/mean_terminated_length": 113.0, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 7.292963743209839, "epoch": 0.9119153294639809, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2126864812181754e-07, "loss": 0.0, "num_tokens": 2101490566.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 2032.365234375, "completions/mean_terminated_length": 447.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.333116173744202, "epoch": 0.9122567429156708, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2110602211397773e-07, "loss": 0.0, "num_tokens": 2102605233.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 2025.330078125, "completions/mean_terminated_length": 113.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.246962904930115, "epoch": 0.9125981563673609, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2094400530464508e-07, "loss": 0.0, "num_tokens": 2103716762.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 2007.5234375, "completions/mean_terminated_length": 1061.142822265625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 7.094107031822205, "epoch": 0.9129395698190509, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2078259792394745e-07, "loss": 0.0, "num_tokens": 2104827302.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1134.0, "completions/mean_length": 2031.341796875, "completions/mean_terminated_length": 342.20001220703125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 7.297768831253052, "epoch": 0.9132809832707409, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2062180020114684e-07, "loss": 0.0, "num_tokens": 2105949205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 2035.201171875, "completions/mean_terminated_length": 409.75, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 7.244718790054321, "epoch": 0.9136223967224308, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.204616123646394e-07, "loss": 0.0, "num_tokens": 2107069276.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 2000.580078125, "completions/mean_terminated_length": 530.5625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 7.198700904846191, "epoch": 0.9139638101741209, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.203020346419551e-07, "loss": 0.0, "num_tokens": 2108180181.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1151.0, "completions/mean_length": 2039.64453125, "completions/mean_terminated_length": 622.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 7.308897852897644, "epoch": 0.9143052236258109, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2014306725975718e-07, "loss": 0.0, "num_tokens": 2109306191.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 2039.427734375, "completions/mean_terminated_length": 950.75, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 7.271180987358093, "epoch": 0.9146466370775008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1998471044384193e-07, "loss": 0.0, "num_tokens": 2110429610.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 2031.498046875, "completions/mean_terminated_length": 358.20001220703125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.250616550445557, "epoch": 0.9149880505291909, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1982696441913848e-07, "loss": 0.0, "num_tokens": 2111552025.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2042.291015625, "completions/mean_terminated_length": 1885.611083984375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 7.170033574104309, "epoch": 0.9153294639808809, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1966982940970833e-07, "loss": 0.0, "num_tokens": 2112687038.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1156.0, "completions/mean_length": 2026.498046875, "completions/mean_terminated_length": 475.2857360839844, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.316515326499939, "epoch": 0.9156708774325708, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1951330563874515e-07, "loss": 0.0, "num_tokens": 2113804797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1834.0, "completions/mean_length": 2036.78515625, "completions/mean_terminated_length": 899.6000366210938, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 7.347719430923462, "epoch": 0.9160122908842608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1935739332857443e-07, "loss": 0.0, "num_tokens": 2114919183.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1064.0, "completions/mean_length": 2027.654296875, "completions/mean_terminated_length": 311.8333435058594, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 7.291154265403748, "epoch": 0.9163537043359509, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.192020927006531e-07, "loss": 0.0, "num_tokens": 2116031694.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 2032.916015625, "completions/mean_terminated_length": 117.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 7.260668039321899, "epoch": 0.9166951177876408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1904740397556923e-07, "loss": 0.0, "num_tokens": 2117145315.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 2027.962890625, "completions/mean_terminated_length": 338.16668701171875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.3337721824646, "epoch": 0.9170365312393308, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1889332737304179e-07, "loss": 0.0, "num_tokens": 2118254416.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1184.0, "completions/mean_length": 2028.689453125, "completions/mean_terminated_length": 635.5714721679688, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 7.298390030860901, "epoch": 0.9173779446910209, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.187398631119203e-07, "loss": 0.0, "num_tokens": 2119359633.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 7.2158203125, "epoch": 0.9177193581427108, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1858701141018451e-07, "loss": 0.0, "num_tokens": 2120481217.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 2040.78125, "completions/mean_terminated_length": 200.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 7.240722298622131, "epoch": 0.9180607715944008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1843477248494401e-07, "loss": 0.0, "num_tokens": 2121595633.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 2034.130859375, "completions/mean_terminated_length": 627.7999877929688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 7.329877614974976, "epoch": 0.9184021850460908, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1828314655243826e-07, "loss": 0.0, "num_tokens": 2122721124.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 2032.189453125, "completions/mean_terminated_length": 24.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.299860000610352, "epoch": 0.9187435984977809, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1813213382803569e-07, "loss": 0.0, "num_tokens": 2123832949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 2037.90234375, "completions/mean_terminated_length": 324.66668701171875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.3011075258255005, "epoch": 0.9190850119494708, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1798173452623397e-07, "loss": 0.0, "num_tokens": 2124947027.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 2040.0859375, "completions/mean_terminated_length": 22.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.260020732879639, "epoch": 0.9194264254011608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1783194886065931e-07, "loss": 0.0, "num_tokens": 2126072863.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 1995.275390625, "completions/mean_terminated_length": 548.2777709960938, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.240965723991394, "epoch": 0.9197678388528508, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1768277704406647e-07, "loss": 0.0, "num_tokens": 2127175612.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 2024.46875, "completions/mean_terminated_length": 843.2000122070312, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 7.342366933822632, "epoch": 0.9201092523045408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1753421928833825e-07, "loss": 0.0, "num_tokens": 2128281180.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1100.0, "completions/mean_length": 2046.1484375, "completions/mean_terminated_length": 1100.0, "completions/min_length": 1100.0, "completions/min_terminated_length": 1100.0, "entropy": 7.300578832626343, "epoch": 0.9204506657562308, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1738627580448519e-07, "loss": 0.0, "num_tokens": 2129410344.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1272.0, "completions/mean_length": 1987.802734375, "completions/mean_terminated_length": 647.0454711914062, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.059694766998291, "epoch": 0.9207920792079208, "frac_reward_zero_std": 0.96875, "grad_norm": 0.03407132024276849, "learning_rate": 1.1723894680264526e-07, "loss": -0.0025, "num_tokens": 2130509907.0, "reward": 0.03857421875, "reward_std": 0.013819883577525616, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15143637359142303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 2697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 2033.74609375, "completions/mean_terminated_length": 588.4000244140625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 7.327404499053955, "epoch": 0.9211334926596108, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.170922324920839e-07, "loss": 0.0, "num_tokens": 2131624545.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 2036.1875, "completions/mean_terminated_length": 32.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.236794829368591, "epoch": 0.9214749061113008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1694613308119312e-07, "loss": 0.0, "num_tokens": 2132750977.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 2038.021484375, "completions/mean_terminated_length": 345.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 7.286463975906372, "epoch": 0.9218163195629908, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1680064877749168e-07, "loss": 0.0, "num_tokens": 2133874652.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 2037.595703125, "completions/mean_terminated_length": 272.3333435058594, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 7.259472012519836, "epoch": 0.9221577330146807, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1665577978762473e-07, "loss": 0.0, "num_tokens": 2134991213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 2034.365234375, "completions/mean_terminated_length": 302.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 7.275458931922913, "epoch": 0.9224991464663708, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.165115263173633e-07, "loss": 0.0, "num_tokens": 2136106568.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 2032.4453125, "completions/mean_terminated_length": 720.6666870117188, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 7.266222238540649, "epoch": 0.9228405599180608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1636788857160406e-07, "loss": 0.0, "num_tokens": 2137224844.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 2038.5078125, "completions/mean_terminated_length": 833.0, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 7.271870970726013, "epoch": 0.9231819733697507, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1622486675436945e-07, "loss": 0.0, "num_tokens": 2138350688.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.966796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1997.583984375, "completions/mean_terminated_length": 529.5882568359375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.238677263259888, "epoch": 0.9235233868214408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1608246106880662e-07, "loss": 0.0, "num_tokens": 2139453051.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 7.3818359375, "epoch": 0.9238648002731308, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1594067171718788e-07, "loss": 0.0, "num_tokens": 2140579019.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1146.0, "completions/mean_length": 2021.39453125, "completions/mean_terminated_length": 345.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.203421235084534, "epoch": 0.9242062137248208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1579949890090993e-07, "loss": 0.0, "num_tokens": 2141696805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 2023.232421875, "completions/mean_terminated_length": 236.4285888671875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 7.280059337615967, "epoch": 0.9245476271765107, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.15658942820494e-07, "loss": 0.0, "num_tokens": 2142805884.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1909.0, "completions/mean_length": 2021.75390625, "completions/mean_terminated_length": 368.25, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.2829906940460205, "epoch": 0.9248890406282008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1551900367558484e-07, "loss": 0.0, "num_tokens": 2143923710.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.966796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 2003.453125, "completions/mean_terminated_length": 706.3529663085938, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 7.246068120002747, "epoch": 0.9252304540798908, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1537968166495146e-07, "loss": 0.0, "num_tokens": 2145030886.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1996.6328125, "completions/mean_terminated_length": 663.7894897460938, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 7.19542121887207, "epoch": 0.9255718675315807, "frac_reward_zero_std": 0.96875, "grad_norm": 0.05931471990966025, "learning_rate": 1.1524097698648583e-07, "loss": 0.0041, "num_tokens": 2146138202.0, "reward": 0.013671875, "reward_std": 0.0034938561730086803, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.013671875, "rewards/tag_count_reward/std": 0.07856711745262146, "step": 2711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1093.0, "completions/mean_length": 2019.658203125, "completions/mean_terminated_length": 435.6666564941406, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 7.319682359695435, "epoch": 0.9259132809832707, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1510288983720338e-07, "loss": 0.0, "num_tokens": 2147247419.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1721.0, "completions/mean_length": 2041.0859375, "completions/mean_terminated_length": 1163.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.321959853172302, "epoch": 0.9262546944349608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1496542041324228e-07, "loss": 0.0, "num_tokens": 2148368327.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 2029.978515625, "completions/mean_terminated_length": 510.16668701171875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.265071988105774, "epoch": 0.9265961078866507, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1482856890986333e-07, "loss": 0.0, "num_tokens": 2149482604.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 2040.537109375, "completions/mean_terminated_length": 137.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 7.306133031845093, "epoch": 0.9269375213383407, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1469233552144955e-07, "loss": 0.0, "num_tokens": 2150603199.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 2028.716796875, "completions/mean_terminated_length": 637.5714721679688, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.246744871139526, "epoch": 0.9272789347900308, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1455672044150609e-07, "loss": 0.0, "num_tokens": 2151721742.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 2040.0234375, "completions/mean_terminated_length": 6.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.293856382369995, "epoch": 0.9276203482417207, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1442172386265972e-07, "loss": 0.0, "num_tokens": 2152837802.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1255.0, "completions/mean_length": 1998.34375, "completions/mean_terminated_length": 776.7999877929688, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 7.165678143501282, "epoch": 0.9279617616934107, "frac_reward_zero_std": 0.96875, "grad_norm": 0.04663968775031813, "learning_rate": 1.142873459766589e-07, "loss": 0.0005, "num_tokens": 2153946362.0, "reward": 0.02734375, "reward_std": 0.015625, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.10772226005792618, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1044.0, "completions/mean_length": 2046.0390625, "completions/mean_terminated_length": 1044.0, "completions/min_length": 1044.0, "completions/min_terminated_length": 1044.0, "entropy": 7.199297070503235, "epoch": 0.9283031751451007, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1415358697437315e-07, "loss": 0.0, "num_tokens": 2155071982.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1832.0, "completions/mean_length": 2039.626953125, "completions/mean_terminated_length": 619.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.320500135421753, "epoch": 0.9286445885967907, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1402044704579305e-07, "loss": 0.0, "num_tokens": 2156187599.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 2032.833984375, "completions/mean_terminated_length": 495.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.31626284122467, "epoch": 0.9289860020484807, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1388792638002969e-07, "loss": 0.0, "num_tokens": 2157306554.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1854.0, "completions/mean_length": 2035.51953125, "completions/mean_terminated_length": 770.0, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 7.308812260627747, "epoch": 0.9293274155001707, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1375602516531472e-07, "loss": 0.0, "num_tokens": 2158426436.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1763.0, "completions/mean_length": 2036.056640625, "completions/mean_terminated_length": 519.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.250640630722046, "epoch": 0.9296688289518606, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.136247435889998e-07, "loss": 0.0, "num_tokens": 2159547025.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1164.0, "completions/mean_length": 2046.2734375, "completions/mean_terminated_length": 1164.0, "completions/min_length": 1164.0, "completions/min_terminated_length": 1164.0, "entropy": 7.247357130050659, "epoch": 0.9300102424035507, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.134940818375565e-07, "loss": 0.0, "num_tokens": 2160670109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2036.484375, "completions/mean_terminated_length": 1753.2000732421875, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "entropy": 7.185216426849365, "epoch": 0.9303516558552407, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1336404009657603e-07, "loss": 0.0, "num_tokens": 2161800021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1054.0, "completions/mean_length": 2038.228515625, "completions/mean_terminated_length": 380.3333435058594, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 7.286699414253235, "epoch": 0.9306930693069307, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1323461855076901e-07, "loss": 0.0, "num_tokens": 2162929658.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 2022.66015625, "completions/mean_terminated_length": 194.57144165039062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.349892020225525, "epoch": 0.9310344827586207, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1310581738396499e-07, "loss": 0.0, "num_tokens": 2164046076.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1968.0, "completions/mean_length": 2026.08203125, "completions/mean_terminated_length": 645.25, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 7.299405455589294, "epoch": 0.9313758962103107, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1297763677911238e-07, "loss": 0.0, "num_tokens": 2165161494.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 2043.369140625, "completions/mean_terminated_length": 862.5, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "entropy": 7.347915172576904, "epoch": 0.9317173096620007, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.128500769182781e-07, "loss": 0.0, "num_tokens": 2166288179.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1896.0, "completions/mean_length": 2031.447265625, "completions/mean_terminated_length": 837.2857666015625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "entropy": 7.381665468215942, "epoch": 0.9320587231136906, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1272313798264753e-07, "loss": 0.0, "num_tokens": 2167404648.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1120.0, "completions/mean_length": 2042.265625, "completions/mean_terminated_length": 580.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 7.304984450340271, "epoch": 0.9324001365653807, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1259682015252397e-07, "loss": 0.0, "num_tokens": 2168525744.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 2034.27734375, "completions/mean_terminated_length": 291.5, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 7.298550605773926, "epoch": 0.9327415500170707, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1247112360732859e-07, "loss": 0.0, "num_tokens": 2169643246.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 2034.703125, "completions/mean_terminated_length": 346.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 7.289757132530212, "epoch": 0.9330829634687606, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1234604852559989e-07, "loss": 0.0, "num_tokens": 2170756086.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 7.2880859375, "epoch": 0.9334243769204507, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.122215950849939e-07, "loss": 0.0, "num_tokens": 2171888918.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 2020.939453125, "completions/mean_terminated_length": 68.71428680419922, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.283817529678345, "epoch": 0.9337657903721407, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1209776346228351e-07, "loss": 0.0, "num_tokens": 2172997975.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 2040.044921875, "completions/mean_terminated_length": 11.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.242475271224976, "epoch": 0.9341072038238306, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1197455383335848e-07, "loss": 0.0, "num_tokens": 2174116878.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 2038.671875, "completions/mean_terminated_length": 456.0, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 7.304771661758423, "epoch": 0.9344486172755206, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.11851966373225e-07, "loss": 0.0, "num_tokens": 2175235062.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1816.0, "completions/mean_length": 1992.841796875, "completions/mean_terminated_length": 764.3181762695312, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.197152018547058, "epoch": 0.9347900307272107, "frac_reward_zero_std": 0.96875, "grad_norm": 0.07851532615564896, "learning_rate": 1.1173000125600561e-07, "loss": -0.0026, "num_tokens": 2176337829.0, "reward": 0.01953125, "reward_std": 0.010673906654119492, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.06243881583213806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 2019.9296875, "completions/mean_terminated_length": 251.5, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 7.2935627698898315, "epoch": 0.9351314441789006, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1160865865493885e-07, "loss": 0.0, "num_tokens": 2177446321.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 2037.970703125, "completions/mean_terminated_length": 336.3333435058594, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.266896843910217, "epoch": 0.9354728576305906, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1148793874237905e-07, "loss": 0.0, "num_tokens": 2178571330.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 2039.18359375, "completions/mean_terminated_length": 919.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 7.304916024208069, "epoch": 0.9358142710822807, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1136784168979604e-07, "loss": 0.0, "num_tokens": 2179687936.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 2032.673828125, "completions/mean_terminated_length": 86.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 7.273541688919067, "epoch": 0.9361556845339707, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1124836766777502e-07, "loss": 0.0, "num_tokens": 2180806169.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 2038.447265625, "completions/mean_terminated_length": 417.66668701171875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.250392436981201, "epoch": 0.9364970979856606, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1112951684601616e-07, "loss": 0.0, "num_tokens": 2181931438.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1351.0, "completions/mean_length": 2038.693359375, "completions/mean_terminated_length": 856.75, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 7.325593709945679, "epoch": 0.9368385114373506, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1101128939333448e-07, "loss": 0.0, "num_tokens": 2183057857.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 2026.89453125, "completions/mean_terminated_length": 247.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.366289496421814, "epoch": 0.9371799248890407, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1089368547765956e-07, "loss": 0.0, "num_tokens": 2184168379.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 2043.369140625, "completions/mean_terminated_length": 1257.666748046875, "completions/min_length": 668.0, "completions/min_terminated_length": 668.0, "entropy": 7.236325144767761, "epoch": 0.9375213383407306, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1077670526603537e-07, "loss": 0.0, "num_tokens": 2185307608.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.955078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2022.779296875, "completions/mean_terminated_length": 1486.565185546875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.155123472213745, "epoch": 0.9378627517924206, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1066034892461983e-07, "loss": 0.0, "num_tokens": 2186426039.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1637.0, "completions/mean_length": 2038.728515625, "completions/mean_terminated_length": 861.25, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "entropy": 7.328237771987915, "epoch": 0.9382041652441107, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.105446166186849e-07, "loss": 0.0, "num_tokens": 2187540060.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 7.2431640625, "epoch": 0.9385455786958006, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1042950851261594e-07, "loss": 0.0, "num_tokens": 2188663356.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 2026.40234375, "completions/mean_terminated_length": 205.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.3204803466796875, "epoch": 0.9388869921474906, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1031502476991205e-07, "loss": 0.0, "num_tokens": 2189777962.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 2042.14453125, "completions/mean_terminated_length": 549.0, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 7.286734342575073, "epoch": 0.9392284055991806, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1020116555318504e-07, "loss": 0.0, "num_tokens": 2190903300.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1057.0, "completions/mean_length": 2035.38671875, "completions/mean_terminated_length": 433.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.317111015319824, "epoch": 0.9395698190508706, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1008793102416005e-07, "loss": 0.0, "num_tokens": 2192035226.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 2000.58203125, "completions/mean_terminated_length": 770.2105102539062, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 7.144457101821899, "epoch": 0.9399112325025606, "frac_reward_zero_std": 0.96875, "grad_norm": 0.02569413630734095, "learning_rate": 1.0997532134367466e-07, "loss": -0.0004, "num_tokens": 2193140756.0, "reward": 0.03125, "reward_std": 0.01613743044435978, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12414088100194931, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1159.0, "completions/mean_length": 2033.029296875, "completions/mean_terminated_length": 515.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 7.296233177185059, "epoch": 0.9402526459542506, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0986333667167905e-07, "loss": 0.0, "num_tokens": 2194257907.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 2040.404296875, "completions/mean_terminated_length": 103.5, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 7.25298798084259, "epoch": 0.9405940594059405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0975197716723546e-07, "loss": 0.0, "num_tokens": 2195380866.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1993.390625, "completions/mean_terminated_length": 650.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.225358366966248, "epoch": 0.9409354728576306, "frac_reward_zero_std": 0.96875, "grad_norm": 0.04436258346330781, "learning_rate": 1.0964124298851851e-07, "loss": 0.0009, "num_tokens": 2196480650.0, "reward": 0.01708984375, "reward_std": 0.0081782853230834, "rewards/accuracy_reward/mean": 0.001953125, "rewards/accuracy_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 2756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1776.0, "completions/mean_length": 2033.537109375, "completions/mean_terminated_length": 567.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.249841570854187, "epoch": 0.9412768863093206, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0953113429281422e-07, "loss": 0.0, "num_tokens": 2197603133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 2040.197265625, "completions/mean_terminated_length": 50.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 7.325751304626465, "epoch": 0.9416182997610106, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0942165123652037e-07, "loss": 0.0, "num_tokens": 2198727138.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1996.208984375, "completions/mean_terminated_length": 574.8333129882812, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 7.130482316017151, "epoch": 0.9419597132127006, "frac_reward_zero_std": 0.96875, "grad_norm": 0.03406719188961604, "learning_rate": 1.0931279397514603e-07, "loss": 0.0001, "num_tokens": 2199842141.0, "reward": 0.01513671875, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 2759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 2024.892578125, "completions/mean_terminated_length": 76.16667175292969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.307883977890015, "epoch": 0.9423011266643906, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0920456266331154e-07, "loss": 0.0, "num_tokens": 2200955238.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 2036.171875, "completions/mean_terminated_length": 29.33333396911621, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.299363374710083, "epoch": 0.9426425401160806, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0909695745474783e-07, "loss": 0.0, "num_tokens": 2202087070.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 2018.96484375, "completions/mean_terminated_length": 189.75, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.357603549957275, "epoch": 0.9429839535677705, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0898997850229693e-07, "loss": 0.0, "num_tokens": 2203195196.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 2036.177734375, "completions/mean_terminated_length": 30.33333396911621, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 7.260913968086243, "epoch": 0.9433253670194606, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0888362595791095e-07, "loss": 0.0, "num_tokens": 2204316215.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1235.0, "completions/mean_length": 2027.833984375, "completions/mean_terminated_length": 573.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 7.304510831832886, "epoch": 0.9436667804711506, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0877789997265255e-07, "loss": 0.0, "num_tokens": 2205430562.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1181.0, "completions/mean_length": 2042.349609375, "completions/mean_terminated_length": 601.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 7.272239089012146, "epoch": 0.9440081939228405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0867280069669415e-07, "loss": 0.0, "num_tokens": 2206549829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1359.0, "completions/mean_length": 2031.208984375, "completions/mean_terminated_length": 615.1666870117188, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.315643310546875, "epoch": 0.9443496073745306, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0856832827931831e-07, "loss": 0.0, "num_tokens": 2207661184.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 2044.654296875, "completions/mean_terminated_length": 335.0, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "entropy": 7.241289138793945, "epoch": 0.9446910208262206, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0846448286891689e-07, "loss": 0.0, "num_tokens": 2208778607.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1224.0, "completions/mean_length": 2046.390625, "completions/mean_terminated_length": 1224.0, "completions/min_length": 1224.0, "completions/min_terminated_length": 1224.0, "entropy": 7.309555172920227, "epoch": 0.9450324342779105, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0836126461299149e-07, "loss": 0.0, "num_tokens": 2209904359.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1147.0, "completions/mean_length": 2026.779296875, "completions/mean_terminated_length": 495.857177734375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 7.350939512252808, "epoch": 0.9453738477296005, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0825867365815255e-07, "loss": 0.0, "num_tokens": 2211014086.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 2036.03125, "completions/mean_terminated_length": 516.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.2472755908966064, "epoch": 0.9457152611812906, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.081567101501198e-07, "loss": 0.0, "num_tokens": 2212142022.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 2036.22265625, "completions/mean_terminated_length": 38.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 7.3334585428237915, "epoch": 0.9460566746329805, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0805537423372147e-07, "loss": 0.0, "num_tokens": 2213262632.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 2038.353515625, "completions/mean_terminated_length": 401.66668701171875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.276898384094238, "epoch": 0.9463980880846705, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0795466605289464e-07, "loss": 0.0, "num_tokens": 2214396141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 2034.6328125, "completions/mean_terminated_length": 337.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 7.255436182022095, "epoch": 0.9467395015363606, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0785458575068455e-07, "loss": 0.0, "num_tokens": 2215515649.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1504.0, "completions/mean_length": 2003.662109375, "completions/mean_terminated_length": 853.2105102539062, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 7.199223756790161, "epoch": 0.9470809149880506, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0775513346924465e-07, "loss": 0.0, "num_tokens": 2216618708.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1784.0, "completions/mean_length": 2034.361328125, "completions/mean_terminated_length": 651.4000244140625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 7.323652863502502, "epoch": 0.9474223284397405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0765630934983644e-07, "loss": 0.0, "num_tokens": 2217731613.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1281.0, "completions/mean_length": 2031.4765625, "completions/mean_terminated_length": 638.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 7.247342586517334, "epoch": 0.9477637418914305, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0755811353282915e-07, "loss": 0.0, "num_tokens": 2218847457.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 2033.1171875, "completions/mean_terminated_length": 143.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.291309118270874, "epoch": 0.9481051553431206, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0746054615769942e-07, "loss": 0.0, "num_tokens": 2219970829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 2036.7265625, "completions/mean_terminated_length": 124.0, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 7.288694143295288, "epoch": 0.9484465687948105, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0736360736303154e-07, "loss": 0.0, "num_tokens": 2221093521.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 2036.333984375, "completions/mean_terminated_length": 57.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 7.180588126182556, "epoch": 0.9487879822465005, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0726729728651671e-07, "loss": 0.0, "num_tokens": 2222222540.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 2041.595703125, "completions/mean_terminated_length": 408.5, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 7.3536765575408936, "epoch": 0.9491293956981905, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0717161606495317e-07, "loss": 0.0, "num_tokens": 2223348349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 2044.33203125, "completions/mean_terminated_length": 170.0, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 7.243589878082275, "epoch": 0.9494708091498805, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0707656383424609e-07, "loss": 0.0, "num_tokens": 2224478807.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 2037.486328125, "completions/mean_terminated_length": 253.6666717529297, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 7.308361053466797, "epoch": 0.9498122226015705, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0698214072940701e-07, "loss": 0.0, "num_tokens": 2225595648.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 2040.05859375, "completions/mean_terminated_length": 15.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.273409843444824, "epoch": 0.9501536360532605, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0688834688455399e-07, "loss": 0.0, "num_tokens": 2226710990.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1841.0, "completions/mean_length": 2039.2734375, "completions/mean_terminated_length": 1154.4000244140625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 7.286203980445862, "epoch": 0.9504950495049505, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0679518243291127e-07, "loss": 0.0, "num_tokens": 2227824330.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 2034.37890625, "completions/mean_terminated_length": 653.2000122070312, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.179887413978577, "epoch": 0.9508364629566405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0670264750680906e-07, "loss": 0.0, "num_tokens": 2228952076.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 2024.59765625, "completions/mean_terminated_length": 336.2857360839844, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.312090516090393, "epoch": 0.9511778764083305, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0661074223768346e-07, "loss": 0.0, "num_tokens": 2230065310.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 2033.55078125, "completions/mean_terminated_length": 198.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 7.34731650352478, "epoch": 0.9515192898600204, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0651946675607618e-07, "loss": 0.0, "num_tokens": 2231184552.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 2041.248046875, "completions/mean_terminated_length": 319.5, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 7.311315298080444, "epoch": 0.9518607033117105, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.064288211916344e-07, "loss": 0.0, "num_tokens": 2232309543.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 2040.03125, "completions/mean_terminated_length": 8.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.240180134773254, "epoch": 0.9522021167634005, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0633880567311049e-07, "loss": 0.0, "num_tokens": 2233432903.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 2034.275390625, "completions/mean_terminated_length": 291.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.320512056350708, "epoch": 0.9525435302150905, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0624942032836199e-07, "loss": 0.0, "num_tokens": 2234560996.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 2041.013671875, "completions/mean_terminated_length": 259.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 7.302452206611633, "epoch": 0.9528849436667804, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0616066528435134e-07, "loss": 0.0, "num_tokens": 2235687259.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 2036.708984375, "completions/mean_terminated_length": 121.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.229143023490906, "epoch": 0.9532263571184705, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0607254066714566e-07, "loss": 0.0, "num_tokens": 2236808550.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 2024.595703125, "completions/mean_terminated_length": 336.14288330078125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.232854127883911, "epoch": 0.9535677705701605, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0598504660191671e-07, "loss": 0.0, "num_tokens": 2237922903.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 2036.880859375, "completions/mean_terminated_length": 150.33334350585938, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 7.31508207321167, "epoch": 0.9539091840218504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0589818321294048e-07, "loss": 0.0, "num_tokens": 2239044954.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 2039.306640625, "completions/mean_terminated_length": 564.3333740234375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 7.333272933959961, "epoch": 0.9542505974735405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.058119506235973e-07, "loss": 0.0, "num_tokens": 2240167175.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 2044.078125, "completions/mean_terminated_length": 40.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 7.34885835647583, "epoch": 0.9545920109252305, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0572634895637133e-07, "loss": 0.0, "num_tokens": 2241286447.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 1987.619140625, "completions/mean_terminated_length": 502.25, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 7.228660345077515, "epoch": 0.9549334243769204, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0564137833285074e-07, "loss": 0.0, "num_tokens": 2242389692.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 2037.421875, "completions/mean_terminated_length": 242.6666717529297, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 7.36349630355835, "epoch": 0.9552748378286104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0555703887372733e-07, "loss": 0.0, "num_tokens": 2243505172.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1177.0, "completions/mean_length": 2038.83203125, "completions/mean_terminated_length": 483.3333435058594, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 7.2824387550354, "epoch": 0.9556162512803005, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0547333069879628e-07, "loss": 0.0, "num_tokens": 2244624366.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1068.0, "completions/mean_length": 2032.486328125, "completions/mean_terminated_length": 724.1666870117188, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "entropy": 7.2856022119522095, "epoch": 0.9559576647319904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.053902539269563e-07, "loss": 0.0, "num_tokens": 2245738727.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 2034.869140625, "completions/mean_terminated_length": 703.4000244140625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 7.31130838394165, "epoch": 0.9562990781836804, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0530780867620914e-07, "loss": 0.0, "num_tokens": 2246863060.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1769.0, "completions/mean_length": 2039.39453125, "completions/mean_terminated_length": 946.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.231595396995544, "epoch": 0.9566404916353705, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0522599506365953e-07, "loss": 0.0, "num_tokens": 2247984206.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 2037.8515625, "completions/mean_terminated_length": 749.0, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 7.33868670463562, "epoch": 0.9569819050870604, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0514481320551505e-07, "loss": 0.0, "num_tokens": 2249105026.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1438.0, "completions/mean_length": 2036.587890625, "completions/mean_terminated_length": 587.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.341476798057556, "epoch": 0.9573233185387504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0506426321708588e-07, "loss": 0.0, "num_tokens": 2250226895.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 2029.5859375, "completions/mean_terminated_length": 476.66668701171875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 7.371154546737671, "epoch": 0.9576647319904404, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0498434521278483e-07, "loss": 0.0, "num_tokens": 2251342587.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 2034.884765625, "completions/mean_terminated_length": 369.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.288599729537964, "epoch": 0.9580061454421305, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0490505930612697e-07, "loss": 0.0, "num_tokens": 2252463984.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2044.0078125, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.269175887107849, "epoch": 0.9583475588938204, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0482640560972955e-07, "loss": 0.0, "num_tokens": 2253588612.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 2035.71875, "completions/mean_terminated_length": 476.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.253320217132568, "epoch": 0.9586889723455104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0474838423531176e-07, "loss": 0.0, "num_tokens": 2254711780.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 2024.181640625, "completions/mean_terminated_length": 305.8571472167969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.287395119667053, "epoch": 0.9590303857972005, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0467099529369473e-07, "loss": 0.0, "num_tokens": 2255825921.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 7.265625, "epoch": 0.9593717992488904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0459423889480126e-07, "loss": 0.0, "num_tokens": 2256953089.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.962890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2036.9140625, "completions/mean_terminated_length": 1749.26318359375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 7.135571241378784, "epoch": 0.9597132127005804, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0451811514765569e-07, "loss": 0.0, "num_tokens": 2258084133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 2042.94140625, "completions/mean_terminated_length": 753.0, "completions/min_length": 657.0, "completions/min_terminated_length": 657.0, "entropy": 7.251187801361084, "epoch": 0.9600546261522704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0444262416038376e-07, "loss": 0.0, "num_tokens": 2259210375.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.966796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1825.0, "completions/mean_length": 2003.0078125, "completions/mean_terminated_length": 692.941162109375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 7.214335203170776, "epoch": 0.9603960396039604, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0436776604021244e-07, "loss": 0.0, "num_tokens": 2260321563.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 2029.525390625, "completions/mean_terminated_length": 156.1999969482422, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.249168872833252, "epoch": 0.9607374530556504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0429354089346976e-07, "loss": 0.0, "num_tokens": 2261436504.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 2040.759765625, "completions/mean_terminated_length": 194.5, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 7.29896342754364, "epoch": 0.9610788665073404, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0421994882558466e-07, "loss": 0.0, "num_tokens": 2262561341.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 2026.564453125, "completions/mean_terminated_length": 218.83334350585938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.289191961288452, "epoch": 0.9614202799590303, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0414698994108689e-07, "loss": 0.0, "num_tokens": 2263678798.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 2041.677734375, "completions/mean_terminated_length": 429.5, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 7.301053762435913, "epoch": 0.9617616934107204, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0407466434360689e-07, "loss": 0.0, "num_tokens": 2264797017.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1094.0, "completions/mean_length": 2034.29296875, "completions/mean_terminated_length": 293.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.26755690574646, "epoch": 0.9621031068624104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0400297213587539e-07, "loss": 0.0, "num_tokens": 2265912111.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1487.0, "completions/mean_length": 2029.126953125, "completions/mean_terminated_length": 437.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.244848608970642, "epoch": 0.9624445203141003, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0393191341972373e-07, "loss": 0.0, "num_tokens": 2267023072.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 2030.37890625, "completions/mean_terminated_length": 243.60000610351562, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.2744786739349365, "epoch": 0.9627859337657904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0386148829608319e-07, "loss": 0.0, "num_tokens": 2268132882.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 2031.97265625, "completions/mean_terminated_length": 406.8000183105469, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 7.286127328872681, "epoch": 0.9631273472174804, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0379169686498522e-07, "loss": 0.0, "num_tokens": 2269246084.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1261.0, "completions/mean_length": 2037.08203125, "completions/mean_terminated_length": 650.5, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 7.30204176902771, "epoch": 0.9634687606691703, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0372253922556121e-07, "loss": 0.0, "num_tokens": 2270373726.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1124.0, "completions/mean_length": 2027.55859375, "completions/mean_terminated_length": 303.66668701171875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.265119671821594, "epoch": 0.9638101741208603, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0365401547604226e-07, "loss": 0.0, "num_tokens": 2271485884.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.966796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1237.0, "completions/mean_length": 2011.0859375, "completions/mean_terminated_length": 936.2352905273438, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 7.171500325202942, "epoch": 0.9641515875725504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0358612571375903e-07, "loss": 0.0, "num_tokens": 2272600424.0, "reward": 0.046875, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 2036.833984375, "completions/mean_terminated_length": 142.33334350585938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.2392518520355225, "epoch": 0.9644930010242404, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0351887003514184e-07, "loss": 0.0, "num_tokens": 2273724435.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1103.0, "completions/mean_length": 2039.251953125, "completions/mean_terminated_length": 555.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 7.26644492149353, "epoch": 0.9648344144759303, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0345224853572018e-07, "loss": 0.0, "num_tokens": 2274850052.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1288.0, "completions/mean_length": 2040.958984375, "completions/mean_terminated_length": 846.3333740234375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 7.348689675331116, "epoch": 0.9651758279276204, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0338626131012295e-07, "loss": 0.0, "num_tokens": 2275967695.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1378.0, "completions/mean_length": 2004.466796875, "completions/mean_terminated_length": 809.7222290039062, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 7.176467657089233, "epoch": 0.9655172413793104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0332090845207793e-07, "loss": 0.0, "num_tokens": 2277084078.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1345.0, "completions/mean_length": 2016.15625, "completions/mean_terminated_length": 417.6000061035156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.3072813749313354, "epoch": 0.9658586548310003, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0325619005441191e-07, "loss": 0.0, "num_tokens": 2278196526.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 2040.439453125, "completions/mean_terminated_length": 112.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.333136081695557, "epoch": 0.9662000682826903, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0319210620905063e-07, "loss": 0.0, "num_tokens": 2279322463.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1097.0, "completions/mean_length": 2042.416015625, "completions/mean_terminated_length": 618.5, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 7.258867025375366, "epoch": 0.9665414817343804, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0312865700701835e-07, "loss": 0.0, "num_tokens": 2280444132.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1103.0, "completions/mean_length": 2018.63671875, "completions/mean_terminated_length": 377.5555725097656, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 7.201134204864502, "epoch": 0.9668828951860703, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0306584253843792e-07, "loss": 0.0, "num_tokens": 2281555562.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1095.0, "completions/mean_length": 2031.583984375, "completions/mean_terminated_length": 367.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 7.3007752895355225, "epoch": 0.9672243086377603, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.030036628925307e-07, "loss": 0.0, "num_tokens": 2282665237.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 2037.755859375, "completions/mean_terminated_length": 299.66668701171875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.2775774002075195, "epoch": 0.9675657220894504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0294211815761628e-07, "loss": 0.0, "num_tokens": 2283784440.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 2042.255859375, "completions/mean_terminated_length": 577.5, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "entropy": 7.3236308097839355, "epoch": 0.9679071355411403, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.028812084211124e-07, "loss": 0.0, "num_tokens": 2284897931.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2024.31640625, "completions/mean_terminated_length": 1496.8182373046875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.231804370880127, "epoch": 0.9682485489928303, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0282093376953499e-07, "loss": 0.0, "num_tokens": 2286014221.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 2038.166015625, "completions/mean_terminated_length": 369.66668701171875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 7.2927937507629395, "epoch": 0.9685899624445203, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0276129428849773e-07, "loss": 0.0, "num_tokens": 2287134434.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1100.0, "completions/mean_length": 2034.21484375, "completions/mean_terminated_length": 636.4000244140625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 7.28345787525177, "epoch": 0.9689313758962103, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0270229006271222e-07, "loss": 0.0, "num_tokens": 2288258528.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1056.0, "completions/mean_length": 2038.544921875, "completions/mean_terminated_length": 434.3333435058594, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 7.30631959438324, "epoch": 0.9692727893479003, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0264392117598772e-07, "loss": 0.0, "num_tokens": 2289378599.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 2036.41796875, "completions/mean_terminated_length": 71.33333587646484, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 7.288433909416199, "epoch": 0.9696142027995903, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.02586187711231e-07, "loss": 0.0, "num_tokens": 2290503981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 2033.693359375, "completions/mean_terminated_length": 216.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.286610126495361, "epoch": 0.9699556162512804, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0252908975044645e-07, "loss": 0.0, "num_tokens": 2291626192.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1960.0, "completions/mean_length": 1989.47265625, "completions/mean_terminated_length": 685.9091186523438, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.239510655403137, "epoch": 0.9702970297029703, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0247262737473563e-07, "loss": 0.0, "num_tokens": 2292730866.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2036.57421875, "completions/mean_terminated_length": 1723.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.08867084980011, "epoch": 0.9706384431546603, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0241680066429735e-07, "loss": 0.0, "num_tokens": 2293849080.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 2037.537109375, "completions/mean_terminated_length": 262.3333435058594, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 7.229819059371948, "epoch": 0.9709798566063503, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0236160969842753e-07, "loss": 0.0, "num_tokens": 2294972427.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 2029.0546875, "completions/mean_terminated_length": 108.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.293890118598938, "epoch": 0.9713212700580403, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0230705455551917e-07, "loss": 0.0, "num_tokens": 2296093031.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1763.0, "completions/mean_length": 2043.875, "completions/mean_terminated_length": 992.0, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 7.2563252449035645, "epoch": 0.9716626835097303, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0225313531306198e-07, "loss": 0.0, "num_tokens": 2297217783.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 2018.484375, "completions/mean_terminated_length": 159.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.273962140083313, "epoch": 0.9720040969614203, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0219985204764262e-07, "loss": 0.0, "num_tokens": 2298329519.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 2034.046875, "completions/mean_terminated_length": 262.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.299933314323425, "epoch": 0.9723455104131102, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0214720483494425e-07, "loss": 0.0, "num_tokens": 2299453319.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1241.0, "completions/mean_length": 2001.197265625, "completions/mean_terminated_length": 716.7222290039062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.196344971656799, "epoch": 0.9726869238648003, "frac_reward_zero_std": 0.96875, "grad_norm": 0.037016598331206726, "learning_rate": 1.0209519374974673e-07, "loss": -0.0016, "num_tokens": 2300563692.0, "reward": 0.033203125, "reward_std": 0.016010859981179237, "rewards/accuracy_reward/mean": 0.018145160749554634, "rewards/accuracy_reward/std": 0.1336110383272171, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 2040.4609375, "completions/mean_terminated_length": 118.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 7.306359887123108, "epoch": 0.9730283373164903, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0204381886592631e-07, "loss": 0.0, "num_tokens": 2301686200.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.93359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1950.880859375, "completions/mean_terminated_length": 585.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 7.154861688613892, "epoch": 0.9733697507681802, "frac_reward_zero_std": 0.9375, "grad_norm": 0.20689220656730967, "learning_rate": 1.0199308025645555e-07, "loss": -0.0015, "num_tokens": 2302774187.0, "reward": 0.06787109375, "reward_std": 0.022209208458662033, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.03076171875, "rewards/tag_count_reward/std": 0.11975188553333282, "step": 2851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 1978.607421875, "completions/mean_terminated_length": 356.1428527832031, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.241445064544678, "epoch": 0.9737111642198703, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0194297799340319e-07, "loss": 0.0, "num_tokens": 2303873234.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 2033.90625, "completions/mean_terminated_length": 845.3333740234375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 7.243670701980591, "epoch": 0.9740525776715603, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0189351214793437e-07, "loss": 0.0, "num_tokens": 2304995954.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 2026.658203125, "completions/mean_terminated_length": 226.83334350585938, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.260930299758911, "epoch": 0.9743939911232502, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0184468279030992e-07, "loss": 0.0, "num_tokens": 2306116227.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1767.0, "completions/mean_length": 2043.455078125, "completions/mean_terminated_length": 884.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.299185276031494, "epoch": 0.9747354045749402, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0179648998988694e-07, "loss": 0.0, "num_tokens": 2307239484.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 2039.47265625, "completions/mean_terminated_length": 592.6666870117188, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 7.2971802949905396, "epoch": 0.9750768180266303, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0174893381511803e-07, "loss": 0.0, "num_tokens": 2308359582.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 1993.21875, "completions/mean_terminated_length": 489.77777099609375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 7.201110363006592, "epoch": 0.9754182314783203, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0170201433355185e-07, "loss": 0.0, "num_tokens": 2309464798.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1165.0, "completions/mean_length": 2032.68359375, "completions/mean_terminated_length": 479.6000061035156, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 7.230119943618774, "epoch": 0.9757596449300102, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0165573161183246e-07, "loss": 0.0, "num_tokens": 2310581484.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 2030.71875, "completions/mean_terminated_length": 278.3999938964844, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.29001522064209, "epoch": 0.9761010583817002, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.016100857156996e-07, "loss": 0.0, "num_tokens": 2311705244.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 2044.21875, "completions/mean_terminated_length": 112.0, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 7.379701375961304, "epoch": 0.9764424718333903, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0156507670998842e-07, "loss": 0.0, "num_tokens": 2312826812.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 2041.640625, "completions/mean_terminated_length": 420.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.261292815208435, "epoch": 0.9767838852850802, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0152070465862951e-07, "loss": 0.0, "num_tokens": 2313947460.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1053.0, "completions/mean_length": 2023.947265625, "completions/mean_terminated_length": 288.71429443359375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.280720829963684, "epoch": 0.9771252987367702, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0147696962464861e-07, "loss": 0.0, "num_tokens": 2315064857.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1083.0, "completions/mean_length": 2028.2578125, "completions/mean_terminated_length": 363.3333435058594, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 7.2922985553741455, "epoch": 0.9774667121884603, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0143387167016674e-07, "loss": 0.0, "num_tokens": 2316180781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 2035.0, "completions/mean_terminated_length": 716.7999877929688, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.287100791931152, "epoch": 0.9778081256401502, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0139141085639992e-07, "loss": 0.0, "num_tokens": 2317300957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1053.0, "completions/mean_length": 2044.046875, "completions/mean_terminated_length": 1036.0, "completions/min_length": 1019.0, "completions/min_terminated_length": 1019.0, "entropy": 7.237639546394348, "epoch": 0.9781495390918402, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0134958724365934e-07, "loss": 0.0, "num_tokens": 2318417093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.955078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1978.4609375, "completions/mean_terminated_length": 500.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.201614141464233, "epoch": 0.9784909525435302, "frac_reward_zero_std": 0.96875, "grad_norm": 0.08919606875002076, "learning_rate": 1.01308400891351e-07, "loss": 0.0067, "num_tokens": 2319517137.0, "reward": 0.01513671875, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 2866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 2036.822265625, "completions/mean_terminated_length": 140.33334350585938, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.3417418003082275, "epoch": 0.9788323659952202, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0126785185797567e-07, "loss": 0.0, "num_tokens": 2320635398.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.966796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2007.38671875, "completions/mean_terminated_length": 824.8235473632812, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "entropy": 7.149150848388672, "epoch": 0.9791737794469102, "frac_reward_zero_std": 0.96875, "grad_norm": 0.6570820108533104, "learning_rate": 1.01227940201129e-07, "loss": 0.0079, "num_tokens": 2321745148.0, "reward": 0.03369140625, "reward_std": 0.01775088720023632, "rewards/accuracy_reward/mean": 0.02016128972172737, "rewards/accuracy_reward/std": 0.14069372415542603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01416015625, "rewards/tag_count_reward/std": 0.08078429847955704, "step": 2868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1928.0, "completions/mean_length": 2025.00390625, "completions/mean_terminated_length": 576.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 7.312402248382568, "epoch": 0.9795151928986002, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0118866597750134e-07, "loss": 0.0, "num_tokens": 2322855454.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 2029.33984375, "completions/mean_terminated_length": 137.1999969482422, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.235981464385986, "epoch": 0.9798566063502901, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.011500292428775e-07, "loss": 0.0, "num_tokens": 2323981708.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1256.0, "completions/mean_length": 2040.701171875, "completions/mean_terminated_length": 802.3333740234375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 7.304010987281799, "epoch": 0.9801980198019802, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0111203005213692e-07, "loss": 0.0, "num_tokens": 2325105811.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 2040.716796875, "completions/mean_terminated_length": 183.5, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 7.287771940231323, "epoch": 0.9805394332536702, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0107466845925335e-07, "loss": 0.0, "num_tokens": 2326227234.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 2022.44140625, "completions/mean_terminated_length": 178.57144165039062, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.241892337799072, "epoch": 0.9808808467053602, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0103794451729503e-07, "loss": 0.0, "num_tokens": 2327337524.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1294.0, "completions/mean_length": 2027.126953125, "completions/mean_terminated_length": 521.2857666015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.311010479927063, "epoch": 0.9812222601570502, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0100185827842445e-07, "loss": 0.0, "num_tokens": 2328449957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1053.0, "completions/mean_length": 1981.8203125, "completions/mean_terminated_length": 434.4761962890625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.205188751220703, "epoch": 0.9815636736087402, "frac_reward_zero_std": 0.96875, "grad_norm": 0.023159887213347337, "learning_rate": 1.009664097938983e-07, "loss": 0.0002, "num_tokens": 2329550681.0, "reward": 0.017578125, "reward_std": 0.0078125, "rewards/accuracy_reward/mean": 0.002016128972172737, "rewards/accuracy_reward/std": 0.044901326298713684, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.978515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1241.0, "completions/mean_length": 2013.576171875, "completions/mean_terminated_length": 445.727294921875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 7.301867127418518, "epoch": 0.9819050870604302, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0093159911406735e-07, "loss": 0.0, "num_tokens": 2330655632.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1587.0, "completions/mean_length": 2031.306640625, "completions/mean_terminated_length": 338.6000061035156, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 7.263220429420471, "epoch": 0.9822465005121201, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0089742628837653e-07, "loss": 0.0, "num_tokens": 2331771741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1273.0, "completions/mean_length": 2029.98046875, "completions/mean_terminated_length": 510.3333435058594, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.322906732559204, "epoch": 0.9825879139638102, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0086389136536468e-07, "loss": 0.0, "num_tokens": 2332887347.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 2028.400390625, "completions/mean_terminated_length": 41.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.313253045082092, "epoch": 0.9829293274155002, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0083099439266465e-07, "loss": 0.0, "num_tokens": 2334003520.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1210.0, "completions/mean_length": 2020.525390625, "completions/mean_terminated_length": 485.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.294165134429932, "epoch": 0.9832707408671901, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0079873541700307e-07, "loss": 0.0, "num_tokens": 2335106349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 2037.23828125, "completions/mean_terminated_length": 211.33334350585938, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 7.263872861862183, "epoch": 0.9836121543188802, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0076711448420045e-07, "loss": 0.0, "num_tokens": 2336231559.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 2036.49609375, "completions/mean_terminated_length": 575.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.317586898803711, "epoch": 0.9839535677705702, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0073613163917094e-07, "loss": 0.0, "num_tokens": 2337348133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1234.0, "completions/mean_length": 2023.953125, "completions/mean_terminated_length": 509.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.163584113121033, "epoch": 0.9842949812222601, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0070578692592245e-07, "loss": 0.0, "num_tokens": 2338466765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 2023.427734375, "completions/mean_terminated_length": 475.375, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.218203783035278, "epoch": 0.9846363946739501, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0067608038755635e-07, "loss": 0.0, "num_tokens": 2339588776.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 2037.37890625, "completions/mean_terminated_length": 235.33334350585938, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 7.266531944274902, "epoch": 0.9849778081256402, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0064701206626763e-07, "loss": 0.0, "num_tokens": 2340715018.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.978515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 2008.87109375, "completions/mean_terminated_length": 226.72727966308594, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.236600756645203, "epoch": 0.9853192215773301, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0061858200334487e-07, "loss": 0.0, "num_tokens": 2341824072.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.982421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1094.0, "completions/mean_length": 2018.8984375, "completions/mean_terminated_length": 392.4444580078125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 7.295334696769714, "epoch": 0.9856606350290201, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0059079023916987e-07, "loss": 0.0, "num_tokens": 2342932836.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1086.0, "completions/mean_length": 2025.630859375, "completions/mean_terminated_length": 411.8571472167969, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 7.350026965141296, "epoch": 0.9860020484807102, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0056363681321794e-07, "loss": 0.0, "num_tokens": 2344047191.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1190.0, "completions/mean_length": 2026.279296875, "completions/mean_terminated_length": 459.2857360839844, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.292983293533325, "epoch": 0.9863434619324002, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0053712176405765e-07, "loss": 0.0, "num_tokens": 2345167318.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1135.0, "completions/mean_length": 2033.474609375, "completions/mean_terminated_length": 560.6000366210938, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 7.267365455627441, "epoch": 0.9866848753840901, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0051124512935078e-07, "loss": 0.0, "num_tokens": 2346284249.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 2037.482421875, "completions/mean_terminated_length": 253.0, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 7.250683665275574, "epoch": 0.9870262888357801, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0048600694585238e-07, "loss": 0.0, "num_tokens": 2347405392.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 2040.19921875, "completions/mean_terminated_length": 716.6666870117188, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 7.30138635635376, "epoch": 0.9873677022874702, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0046140724941062e-07, "loss": 0.0, "num_tokens": 2348528406.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 2040.068359375, "completions/mean_terminated_length": 17.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 7.221161484718323, "epoch": 0.9877091157391601, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0043744607496673e-07, "loss": 0.0, "num_tokens": 2349656089.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 2031.359375, "completions/mean_terminated_length": 344.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.231658220291138, "epoch": 0.9880505291908501, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0041412345655508e-07, "loss": 0.0, "num_tokens": 2350771921.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 2044.384765625, "completions/mean_terminated_length": 197.0, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 7.312433481216431, "epoch": 0.9883919426425402, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0039143942730297e-07, "loss": 0.0, "num_tokens": 2351897718.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 7.27734375, "epoch": 0.9887333560942301, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0036939401943061e-07, "loss": 0.0, "num_tokens": 2353016182.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 2038.0625, "completions/mean_terminated_length": 352.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.262361288070679, "epoch": 0.9890747695459201, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0034798726425117e-07, "loss": 0.0, "num_tokens": 2354133062.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1267.0, "completions/mean_length": 2033.048828125, "completions/mean_terminated_length": 517.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 7.241049408912659, "epoch": 0.9894161829976101, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0032721919217075e-07, "loss": 0.0, "num_tokens": 2355262175.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 2038.982421875, "completions/mean_terminated_length": 509.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 7.28889262676239, "epoch": 0.9897575964493001, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0030708983268821e-07, "loss": 0.0, "num_tokens": 2356376742.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1942.0, "completions/mean_length": 2036.646484375, "completions/mean_terminated_length": 594.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 7.249188661575317, "epoch": 0.9900990099009901, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0028759921439508e-07, "loss": 0.0, "num_tokens": 2357493569.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 2020.138671875, "completions/mean_terminated_length": 264.875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.275579214096069, "epoch": 0.9904404233526801, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0026874736497576e-07, "loss": 0.0, "num_tokens": 2358608632.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 2036.958984375, "completions/mean_terminated_length": 163.6666717529297, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.270548582077026, "epoch": 0.99078183680437, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0025053431120732e-07, "loss": 0.0, "num_tokens": 2359729603.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 2007.458984375, "completions/mean_terminated_length": 894.8333129882812, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 7.158893585205078, "epoch": 0.9911232502560601, "frac_reward_zero_std": 0.96875, "grad_norm": 0.039585020724485456, "learning_rate": 1.0023296007895951e-07, "loss": -0.0022, "num_tokens": 2360841710.0, "reward": 0.01513671875, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01513671875, "rewards/tag_count_reward/std": 0.08503700792789459, "step": 2903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 2032.60546875, "completions/mean_terminated_length": 77.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.292699933052063, "epoch": 0.9914646637077501, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0021602469319456e-07, "loss": 0.0, "num_tokens": 2361961684.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 2040.037109375, "completions/mean_terminated_length": 9.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 7.299988031387329, "epoch": 0.9918060771594401, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0019972817796753e-07, "loss": 0.0, "num_tokens": 2363075159.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 2034.646484375, "completions/mean_terminated_length": 338.75, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 7.294543027877808, "epoch": 0.9921474906111301, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.001840705564258e-07, "loss": 0.0, "num_tokens": 2364196610.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 2037.423828125, "completions/mean_terminated_length": 965.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 7.259544372558594, "epoch": 0.9924889040628201, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0016905185080935e-07, "loss": 0.0, "num_tokens": 2365318011.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 2045.228515625, "completions/mean_terminated_length": 629.0, "completions/min_length": 629.0, "completions/min_terminated_length": 629.0, "entropy": 7.254974603652954, "epoch": 0.9928303175145101, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0015467208245076e-07, "loss": 0.0, "num_tokens": 2366439328.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 2027.419921875, "completions/mean_terminated_length": 291.8333435058594, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.281938910484314, "epoch": 0.9931717309662, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0014093127177493e-07, "loss": 0.0, "num_tokens": 2367557879.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1215.0, "completions/mean_length": 2031.453125, "completions/mean_terminated_length": 353.6000061035156, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 7.307607173919678, "epoch": 0.9935131444178901, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0012782943829913e-07, "loss": 0.0, "num_tokens": 2368683935.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1755.0, "completions/mean_length": 2005.25390625, "completions/mean_terminated_length": 1005.8095703125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 7.150937557220459, "epoch": 0.9938545578695801, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0011536660063326e-07, "loss": 0.0, "num_tokens": 2369793617.0, "reward": 0.015625, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.08708140254020691, "step": 2911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1086.0, "completions/mean_length": 2034.6328125, "completions/mean_terminated_length": 337.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.308088183403015, "epoch": 0.99419597132127, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0010354277647939e-07, "loss": 0.0, "num_tokens": 2370910565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 2032.9140625, "completions/mean_terminated_length": 117.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.281537294387817, "epoch": 0.99453738477296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0009235798263205e-07, "loss": 0.0, "num_tokens": 2372021993.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 2037.56640625, "completions/mean_terminated_length": 267.3333435058594, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 7.2945228815078735, "epoch": 0.9948787982246501, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.00081812234978e-07, "loss": 0.0, "num_tokens": 2373142635.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1077.0, "completions/mean_length": 2029.87890625, "completions/mean_terminated_length": 501.66668701171875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.304256558418274, "epoch": 0.99522021167634, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0007190554849646e-07, "loss": 0.0, "num_tokens": 2374253933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1817.0, "completions/mean_length": 2032.599609375, "completions/mean_terminated_length": 471.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 7.221060156822205, "epoch": 0.99556162512803, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0006263793725872e-07, "loss": 0.0, "num_tokens": 2375380160.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.986328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1027.0, "completions/mean_length": 2025.025390625, "completions/mean_terminated_length": 367.5714416503906, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 7.295747995376587, "epoch": 0.9959030385797201, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0005400941442848e-07, "loss": 0.0, "num_tokens": 2376493389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 2044.453125, "completions/mean_terminated_length": 232.0, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "entropy": 7.291386365890503, "epoch": 0.99624445203141, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0004601999226164e-07, "loss": 0.0, "num_tokens": 2377611717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1727.0, "completions/mean_length": 2024.361328125, "completions/mean_terminated_length": 535.125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.255679488182068, "epoch": 0.9965858654831, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0003866968210636e-07, "loss": 0.0, "num_tokens": 2378724846.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.958984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 2031.685546875, "completions/mean_terminated_length": 1650.2381591796875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 7.086827397346497, "epoch": 0.99692727893479, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0003195849440295e-07, "loss": 0.0, "num_tokens": 2379855213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.994140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1078.0, "completions/mean_length": 2038.56640625, "completions/mean_terminated_length": 438.0, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 7.26067578792572, "epoch": 0.9972686923864801, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0002588643868397e-07, "loss": 0.0, "num_tokens": 2380974463.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1104.0, "completions/mean_length": 2042.28515625, "completions/mean_terminated_length": 585.0, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 7.3161842823028564, "epoch": 0.99761010583817, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0002045352357407e-07, "loss": 0.0, "num_tokens": 2382097937.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1789.0, "completions/mean_length": 2028.490234375, "completions/mean_terminated_length": 383.16668701171875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.322476863861084, "epoch": 0.99795151928986, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.000156597567902e-07, "loss": 0.0, "num_tokens": 2383206748.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1297.0, "completions/mean_length": 2036.69140625, "completions/mean_terminated_length": 600.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 7.271275043487549, "epoch": 0.9982929327415501, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.000115051451414e-07, "loss": 0.0, "num_tokens": 2384320302.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.990234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 2031.576171875, "completions/mean_terminated_length": 366.20001220703125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 7.280023097991943, "epoch": 0.99863434619324, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0000798969452879e-07, "loss": 0.0, "num_tokens": 2385440501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.998046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 2044.2734375, "completions/mean_terminated_length": 140.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 7.291784763336182, "epoch": 0.99897575964493, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0000511340994572e-07, "loss": 0.0, "num_tokens": 2386561777.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1823.0, "completions/mean_length": 2033.671875, "completions/mean_terminated_length": 825.3333740234375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.3543384075164795, "epoch": 0.99931717309662, "frac_reward_zero_std": 0.96875, "grad_norm": 0.22563679541337495, "learning_rate": 1.0000287629547764e-07, "loss": -0.0002, "num_tokens": 2387676937.0, "reward": 0.00048828125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00048828125, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 2032.533203125, "completions/mean_terminated_length": 68.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 7.224379777908325, "epoch": 0.99965858654831, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0000127835430222e-07, "loss": 0.0, "num_tokens": 2388787818.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1275.0, "completions/mean_length": 2036.9609375, "completions/mean_terminated_length": 635.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 7.232345819473267, "epoch": 1.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0000031958868901e-07, "loss": 0.0, "num_tokens": 2389912870.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2929 }, { "epoch": 1.0, "step": 2929, "total_flos": 0.0, "train_loss": 0.021358598244649377, "train_runtime": 55166.4157, "train_samples_per_second": 1.699, "train_steps_per_second": 0.053 } ], "logging_steps": 1, "max_steps": 2929, "num_input_tokens_seen": 2389912870, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }